去评论
dz插件网

出现什么问题了？看不懂啊。

逝水年华
2023/07/10 11:04:21
Loading model for gpu... Traceback (most recent call last):
  File "runEx_2.py", line 74, in <module>
    model.load_state_dict(m2)
  File "C:\Users\ADMIN\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\nn\modules\module.py", line 1406, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for GPT:
        Unexpected key(s) in state_dict: "blocks.10.ln1.weight", "blocks.10.ln1.bias", "blocks.10.ln2.weight", "blocks.10.ln2.bias", "blocks.10.attn.time_w", "blocks.10.attn.time_alpha", "blocks.10.attn.time_beta", "blocks.10.attn.time_gamma", "blocks.10.attn.mask", "blocks.10.attn.key.weight", "blocks.10.attn.key.bias", "blocks.10.attn.value.weight", "blocks.10.attn.value.bias", "blocks.10.attn.receptance.weight", "blocks.10.attn.receptance.bias", "blocks.10.attn.output.weight", "blocks.10.attn.output.bias", "blocks.10.mlp.key.weight", "blocks.10.mlp.key.bias", "blocks.10.mlp.value.weight", "blocks.10.mlp.value.bias", "blocks.10.mlp.weight.weight", "blocks.10.mlp.weight.bias", "blocks.10.mlp.receptance.weight", "blocks.10.mlp.receptance.bias", "blocks.11.ln1.weight", "blocks.11.ln1.bias", "blocks.11.ln2.weight", "blocks.11.ln2.bias", "blocks.11.attn.time_w", "blocks.11.attn.time_alpha", "blocks.11.attn.time_beta", "blocks.11.attn.time_gamma", "blocks.11.attn.mask", "blocks.11.attn.key.weight", "blocks.11.attn.key.bias", "blocks.11.attn.value.weight", "blocks.11.attn.value.bias", "blocks.11.attn.receptance.weight", "blocks.11.attn.receptance.bias", "blocks.11.attn.output.weight", "blocks.11.attn.output.bias", "blocks.11.mlp.key.weight", "blocks.11.mlp.key.bias", "blocks.11.mlp.value.weight", "blocks.11.mlp.value.bias", "blocks.11.mlp.weight.weight", "blocks.11.mlp.weight.bias", "blocks.11.mlp.receptance.weight", "blocks.11.mlp.receptance.bias".
        size mismatch for tok_emb.weight: copying a param with shape torch.Size([4592, 768]) from checkpoint, the shape in current model is torch.Size([4592, 640]).
        size mismatch for blocks.0.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.0.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.0.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.0.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.0.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.0.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.0.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.0.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.0.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.0.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.0.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.0.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.0.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.0.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.0.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.0.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.0.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.0.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.0.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.0.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.0.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.1.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.1.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.1.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.1.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.1.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.1.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.1.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.1.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.1.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.1.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.1.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.1.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.2.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.2.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.2.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.2.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.2.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.2.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.2.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.2.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.2.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.2.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.2.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.2.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.3.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.3.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.3.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.3.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.3.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.3.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.3.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.3.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.3.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.3.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.3.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.3.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.4.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.4.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.4.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.4.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.4.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.4.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.4.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.4.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.4.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.4.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.4.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.4.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.5.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.5.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.5.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.5.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.5.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.5.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.5.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.5.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.5.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.5.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.5.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.5.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.6.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.6.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.6.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.6.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.6.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.6.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.6.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.6.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.6.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.6.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.6.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.6.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.7.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.7.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.7.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.7.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.7.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.7.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.7.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.7.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.7.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.7.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.7.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.7.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.8.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.8.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.8.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.8.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.8.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.8.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.8.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.8.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.8.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.8.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.8.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.8.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.ln1.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.ln1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.ln2.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.ln2.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.attn.time_ww: copying a param with shape torch.Size([12, 512, 512]) from checkpoint, the shape in current model is torch.Size([10, 512, 512]).
        size mismatch for blocks.9.attn.key.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.9.attn.key.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.attn.value.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.9.attn.value.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.attn.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.9.attn.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.attn.output.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.9.attn.output.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.mlp.key.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.9.mlp.key.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.9.mlp.value.weight: copying a param with shape torch.Size([1920, 768]) from checkpoint, the shape in current model is torch.Size([1600, 640]).
        size mismatch for blocks.9.mlp.value.bias: copying a param with shape torch.Size([1920]) from checkpoint, the shape in current model is torch.Size([1600]).
        size mismatch for blocks.9.mlp.weight.weight: copying a param with shape torch.Size([768, 1920]) from checkpoint, the shape in current model is torch.Size([640, 1600]).
        size mismatch for blocks.9.mlp.weight.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for blocks.9.mlp.receptance.weight: copying a param with shape torch.Size([768, 768]) from checkpoint, the shape in current model is torch.Size([640, 640]).
        size mismatch for blocks.9.mlp.receptance.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for ln_f.weight: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for ln_f.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([640]).
        size mismatch for head.weight: copying a param with shape torch.Size([4592, 768]) from checkpoint, the shape in current model is torch.Size([4592, 640]).