Internlm_xcomposer2 model structure explained

Project address

Internlm_xcomposer2 overall model structure

<class 'transformers_modules.internlm-xcomposer2-4khd-7b.modeling_internlm_xcomposer2.InternLMXComposer2ForCausalLM'>
InternLMXComposer2ForCausalLM(
  (model): InternLM2Model(
    (tok_embeddings): Embedding(92544, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x InternLM2DecoderLayer(
        (attention): InternLM2FlashAttention2(
          (wqkv): PLoRA(
            in_features=4096, out_features=6144, bias=False
            (lora_dropout): Dropout(p=0.05, inplace=False)
            (Plora_A): Linear(in_features=4096, out_features=8, bias=False)
            (Plora_B): Linear(in_features=8, out_features=6144, bias=False)
          )
          (wo): PLoRA(
            in_features=4096, out_features=4096, bias=False
            (lora_dropout): Dropout(p=0.05, inplace=False)
            (Plora_A): Linear(in_features=4096, out_features=256, bias=False)
            (Plora_B): Linear(in_features=256, out_features=4096, bias=False)
          )
          (rotary_emb): InternLM2RotaryEmbedding()
        )
        (feed_forward): InternLM2MLP(
          (w1): PLoRA(
            in_features=4096, out_features=14336, bias=False
            (lora_dropout): Dropout(p=0.05, inplace=False)
            (Plora_A): Linear(in_features=4096, out_features=256, bias=False)
            (Plora_B): Linear(in_features=256, out_features=14336, bias=False)
          )
          (w3): PLoRA(
            in_features=4096, out_features=14336, bias=False
            (lora_dropout): Dropout(p=0.05, inplace=False)
            (Plora_A): Linear(in_features=4096, out_features=256, bias=False)
            (Plora_B): Linear(in_features=256, out_features=14336, bias=False)
          )
          (w2): PLoRA(
            in_features=14336, out_features=4096, bias=False
            (lora_dropout): Dropout(p=0.05, inplace=False)
            (Plora_A): Linear(in_features=14336, out_features=256, bias=False)
            (Plora_B): Linear(in_features=256, out_features=4096, bias=False)
          )
          (act_fn): SiLUActivation()
        )
        (attention_norm): InternLM2RMSNorm()
        (ffn_norm): InternLM2RMSNorm()
      )
    )
    (norm): InternLM2RMSNorm()
  )
  (output): Linear(in_features=4096, out_features=92544, bias=False)
  (vit): CLIPVisionTower(
    (vision_tower): CLIPVisionModel(
      (vision_model): CLIPVisionTransformer(
        (embeddings): CLIPVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
          (position_embedding): Embedding(577, 1024)
        )
        (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (encoder): CLIPEncoder(
          (layers): ModuleList(
            (0-23): 24 x CLIPEncoderLayer(
              (self_attn): CLIPAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (mlp): CLIPMLP(
                (activation_fn): QuickGELUActivation()
                (fc1): Linear(in_features=1024, out_features=4096, bias=True)
                (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              )
              (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
          )
        )
        (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (vision_proj): Sequential(
    (0): Linear(in_features=4096, out_features=4096, bias=True)
    (1): GELU(approximate='none')
    (2): Linear(in_features=4096, out_features=4096, bias=True)
  )
)
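The module tree above can be reproduced by loading the checkpoint with transformers and printing the model. A minimal sketch, assuming the Hugging Face model id internlm/internlm-xcomposer2-4khd-7b and that the checkpoint's bundled modeling code is trusted and exposed through AutoModelForCausalLM:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "internlm/internlm-xcomposer2-4khd-7b"  # assumed model id; point to a local path if needed

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # fp16 keeps the 7B checkpoint manageable in memory
    trust_remote_code=True,     # the InternLMXComposer2 modeling code ships with the checkpoint
)

print(type(model))  # transformers_modules...InternLMXComposer2ForCausalLM
print(model)        # prints the module tree shown above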

Internlm_xcomposer2 detailed model structure (below, the per-layer parameter shapes are listed in input-to-output order)
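The per-parameter shapes below can be dumped by iterating over named_parameters; a minimal sketch, reusing the model object from the loading snippet above:

# Print every parameter name with its shape, in registration order
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")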


plora_glb_GN: torch.Size([1, 1, 4096])
plora_sub_GN: torch.Size([1, 1, 1, 4096])
model.tok_embeddings.weight: torch.Size([92544, 4096])

# Main decoder layers (take the text embeddings together with the projected ViT image features below as input)
model.layers.0.attention.wqkv.weight: torch.Size([6144, 4096])
model.layers.0.attention.wqkv.Plora_A.weight: torch.Size([8, 4096])
model.layers.0.attention.wqkv.Plora_B.weight: torch.Size([6144, 8])
model.layers.0.attention.wo.weight: torch.Size([4096, 4096])
model.layers.0.attention.wo.Plora_A.weight: torch.Size([256, 4096])
model.layers.0.attention.wo.Plora_B.weight: torch.Size([4096, 256])
model.layers.0.feed_forward.w1.weight: torch.Size([14336, 4096])
model.layers.0.feed_forward.w1.Plora_A.weight: torch.Size([256, 4096])
model.layers.0.feed_forward.w1.Plora_B.weight: torch.Size([14336, 256])
model.layers.0.feed_forward.w3.weight: torch.Size([14336, 4096])
model.layers.0.feed_forward.w3.Plora_A.weight: torch.Size([256, 4096])
model.layers.0.feed_forward.w3.Plora_B.weight: torch.Size([14336, 256])
model.layers.0.feed_forward.w2.weight: torch.Size([4096, 14336])
model.layers.0.feed_forward.w2.Plora_A.weight: torch.Size([256, 14336])
model.layers.0.feed_forward.w2.Plora_B.weight: torch.Size([4096, 256])
model.layers.0.attention_norm.weight: torch.Size([4096])
model.layers.0.ffn_norm.weight: torch.Size([4096])

... 32 model.layers blocks in total; model.layers.1 through model.layers.30 are omitted here

model.layers.31.attention.wqkv.weight: torch.Size([6144, 4096])
model.layers.31.attention.wqkv.Plora_A.weight: torch.Size([8, 4096])
model.layers.31.attention.wqkv.Plora_B.weight: torch.Size([6144, 8])
model.layers.31.attention.wo.weight: torch.Size([4096, 4096])
model.layers.31.attention.wo.Plora_A.weight: torch.Size([256, 4096])
model.layers.31.attention.wo.Plora_B.weight: torch.Size([4096, 256])
model.layers.31.feed_forward.w1.weight: torch.Size([14336, 4096])
model.layers.31.feed_forward.w1.Plora_A.weight: torch.Size([256, 4096])
model.layers.31.feed_forward.w1.Plora_B.weight: torch.Size([14336, 256])
model.layers.31.feed_forward.w3.weight: torch.Size([14336, 4096])
model.layers.31.feed_forward.w3.Plora_A.weight: torch.Size([256, 4096])
model.layers.31.feed_forward.w3.Plora_B.weight: torch.Size([14336, 256])
model.layers.31.feed_forward.w2.weight: torch.Size([4096, 14336])
model.layers.31.feed_forward.w2.Plora_A.weight: torch.Size([256, 14336])
model.layers.31.feed_forward.w2.Plora_B.weight: torch.Size([4096, 256])
model.layers.31.attention_norm.weight: torch.Size([4096])
model.layers.31.ffn_norm.weight: torch.Size([4096])
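The Plora_A/Plora_B pairs listed above are the Partial LoRA (PLoRA) adapters: on top of each frozen base projection, a low-rank update is applied only to image-token positions, so text tokens see only the original pretrained weights. A minimal sketch of that idea; the im_mask handling and constructor arguments are assumptions based on the published PLoRA design rather than a copy of the source code:

import torch
import torch.nn as nn

class PLoRA(nn.Linear):
    """Minimal Partial-LoRA sketch: the low-rank path only touches image tokens."""

    def __init__(self, in_features, out_features, lora_r=256, lora_dropout=0.05, bias=False):
        super().__init__(in_features, out_features, bias=bias)
        self.lora_dropout = nn.Dropout(p=lora_dropout)
        self.Plora_A = nn.Linear(in_features, lora_r, bias=False)
        self.Plora_B = nn.Linear(lora_r, out_features, bias=False)

    def forward(self, x, im_mask=None):
        out = super().forward(x)                    # frozen base projection, shared by all tokens
        if im_mask is not None and im_mask.any():   # im_mask: bool tensor marking image-token positions
            x_img = self.lora_dropout(x[im_mask])   # gather only the image-token rows
            out[im_mask] = out[im_mask] + self.Plora_B(self.Plora_A(x_img))
        return out

# Example: the wqkv projection from the printout uses a rank-8 adapter
wqkv = PLoRA(4096, 6144, lora_r=8)
x = torch.randn(1, 16, 4096)
im_mask = torch.zeros(1, 16, dtype=torch.bool)
im_mask[:, :8] = True                               # pretend the first 8 tokens are image tokens
y = wqkv(x, im_mask)                                # shape: [1, 16, 6144]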

# Output layers
model.norm.weight: torch.Size([4096])
output.weight: torch.Size([92544, 4096])


vit.vision_tower.vision_model.embeddings.class_embedding: torch.Size([1024])
vit.vision_tower.vision_model.embeddings.patch_embedding.weight: torch.Size([1024, 3, 14, 14])
vit.vision_tower.vision_model.embeddings.position_embedding.weight: torch.Size([577, 1024])
vit.vision_tower.vision_model.pre_layrnorm.weight: torch.Size([1024])
vit.vision_tower.vision_model.pre_layrnorm.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight: torch.Size([1024, 1024])
vit.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight: torch.Size([1024, 1024])
vit.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight: torch.Size([1024, 1024])
vit.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight: torch.Size([1024, 1024])
vit.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight: torch.Size([4096, 1024])
vit.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias: torch.Size([4096])
vit.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight: torch.Size([1024, 4096])
vit.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias: torch.Size([1024])

... 24 vit.vision_tower.vision_model.encoder.layers blocks in total; vit.vision_tower.vision_model.encoder.layers.1 through vit.vision_tower.vision_model.encoder.layers.22 are omitted here

vit.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight: torch.Size([1024, 1024])
vit.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight: torch.Size([1024, 1024])
vit.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight: torch.Size([1024, 1024])
vit.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight: torch.Size([1024, 1024])
vit.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight: torch.Size([4096, 1024])
vit.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias: torch.Size([4096])
vit.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight: torch.Size([1024, 4096])
vit.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight: torch.Size([1024])
vit.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias: torch.Size([1024])
vit.vision_tower.vision_model.post_layernorm.weight: torch.Size([1024])
vit.vision_tower.vision_model.post_layernorm.bias: torch.Size([1024])
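As a quick sanity check of the embedding shapes above: a ViT-L/14 patch embedding at a 336x336 base resolution yields 24x24 = 576 patch tokens plus one class token, matching Embedding(577, 1024). The 336 base resolution is an assumption inferred from these shapes; the 4KHD variant tiles higher-resolution inputs into multiple such crops.

image_size, patch_size = 336, 14                  # assumed base resolution of the CLIP vision tower
num_patches = (image_size // patch_size) ** 2     # 24 * 24 = 576 patch tokens
num_positions = num_patches + 1                   # + 1 class token
print(num_patches, num_positions)                 # 576 577 -> matches Embedding(577, 1024)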

# vision_proj that maps ViT features into the embedding space consumed by model.layers.0
vision_proj.0.weight: torch.Size([4096, 4096])
vision_proj.0.bias: torch.Size([4096])
vision_proj.2.weight: torch.Size([4096, 4096])
vision_proj.2.bias: torch.Size([4096])
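A minimal sketch of the two-layer projector listed above. Its input dimension is 4096 rather than the ViT hidden size of 1024, which suggests that four neighboring 1024-dim patch features are concatenated before projection; that merge step is an assumption inferred from the shapes, not taken from the source code.

import torch
import torch.nn as nn

# Projector as printed above: Linear -> GELU -> Linear, mapping into the LLM's 4096-dim space
vision_proj = nn.Sequential(
    nn.Linear(4096, 4096),
    nn.GELU(),
    nn.Linear(4096, 4096),
)

vit_feats = torch.randn(1, 576, 1024)             # [batch, patch tokens, ViT dim] for one 336x336 crop
merged = vit_feats.reshape(1, -1, 4 * 1024)       # assumed merge of 4 patch features -> [1, 144, 4096]
visual_tokens = vision_proj(merged)               # [1, 144, 4096], ready to interleave with text embeddings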
