# Sharding config for Mixtral.
# Each value is either an int in [0, rank - 1], naming the tensor axis to
# shard across devices, or -1 / null, meaning the tensor is fully replicated.
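# Example of the convention (illustrative only, not an entry of its own):
# with the value 1, a 2-D weight such as tok_embeddings.weight with shape
# (vocab_size, 4096) is split along its second axis across devices, while
# -1 or null keeps a full replica of the tensor on every device.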
freqs_cis : -1 # torch.complex64 (2048, 64)
tok_embeddings.weight : 1 # torch.float32 (vocab_size, 4096)
tok_embeddings.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wo.weight : 1 # torch.int8 (4096, 4096)
layers.*.attention.wo.weight_scaler : -1 # torch.bfloat16 (4096,)
layers.*.attention.wq.weight : 0 # torch.int8 (4096, 4096)
layers.*.attention.wq.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wk.weight : 0 # torch.int8 (4096, 4096)
layers.*.attention.wk.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wv.weight : 0 # torch.int8 (4096, 4096)
layers.*.attention.wv.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wqkv.weight : 0 # torch.int8 (4096, 4096)
layers.*.attention.wqkv.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.block_sparse_moe.gate.weight : -1
layers.*.block_sparse_moe.gate.weight_scaler : -1
layers.*.block_sparse_moe.cond_ffn.w1 : 1
layers.*.block_sparse_moe.cond_ffn.w1_scaler : 1
layers.*.block_sparse_moe.cond_ffn.w2 : 2
layers.*.block_sparse_moe.cond_ffn.w2_scaler : -1
layers.*.block_sparse_moe.cond_ffn.w3 : 1
layers.*.block_sparse_moe.cond_ffn.w3_scaler : 1
layers.*.ffn_norm.weight : -1 # torch.float32 (4096,)
layers.*.attention_norm.weight : -1 # torch.float32 (4096,)
norm.weight : -1 # torch.float32 (4096,)
output.weight : 0 # torch.float32 (vocab_size, 4096)
output.weight_scaler : 0 # torch.float32 (4096,)