Commit e2ac16b

Large modular logic refactoring (#34487)
* rework converter
* Update modular_model_converter.py
* Update modular_model_converter.py
* Update modular_model_converter.py
* Update modular_model_converter.py
* cleaning
* cleaning
* finalize imports
* imports
* Update modular_model_converter.py
* Better renaming to avoid visiting the same file multiple times
* start converting files
* style
* address most comments
* style
* remove unused stuff in get_needed_imports
* style
* move class dependency functions outside class
* Move main functions outside class
* style
* Update modular_model_converter.py
* rename func
* add augmented dependencies
* Update modular_model_converter.py
* Add types_to_file_type + tweak annotation handling
* Allow assignment dependency mapping + fix regex
* style + update modular examples
* fix modular_roberta example (wrong redefinition of __init__)
* slightly correct order in which dependencies will appear
* style
* review comments
* Performance + better handling of dependencies when they are imported
* style
* Add advanced new classes capabilities
* style
* add forgotten check
* Update modeling_llava_next_video.py
* Add priority list ordering in check_conversion as well
* Update check_modular_conversion.py
* Update configuration_gemma.py
1 parent 86701f2 · commit e2ac16b

19 files changed (+2668, -1600 lines)
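
For context: the generated configuration_*.py files shown below are produced from hand-written modular_*.py sources by utils/modular_model_converter.py, which this commit reworks. A minimal, hypothetical sketch of what such a modular source can look like (the parent class and argument names are illustrative assumptions, not the exact contents of the example files):

# Hypothetical modular definition: only the delta is written by hand; the
# converter inlines the parent's imports, docstring, and full __init__ into the
# generated configuration file. Using LlamaConfig as the parent is an assumption.
from transformers.models.llama.configuration_llama import LlamaConfig


class MyNewModelConfig(LlamaConfig):
    def __init__(self, new_param=0, **super_kwargs):
        super().__init__(**super_kwargs)
        self.new_param = new_param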

examples/modular-transformers/configuration_my_new_model.py (+13, -14)

@@ -1,9 +1,9 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from <path_to_modular_file.py>.
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_xxx.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_my_new_model.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_my_new_model.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 
 from ...configuration_utils import PretrainedConfig
 from ...modeling_rope_utils import rope_config_validation
@@ -158,6 +158,13 @@ def __init__(
         new_param=0,
         **kwargs,
     ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -187,11 +194,3 @@ def __init__(
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         rope_config_validation(self)
         self.new_param = new_param
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
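
The visible effect of the refactor on this generated file is that the call to super().__init__() is now emitted before the attribute assignments rather than after them. As a toy illustration of why emission order can matter (plain Python semantics, not the converter's internals): later assignments win, so placing the parent call first lets the generated class's own assignments take precedence.

# Toy classes, unrelated to transformers, showing that attribute assignments
# made after super().__init__() override whatever the parent set.
class ParentConfig:
    def __init__(self, **kwargs):
        self.use_cache = kwargs.pop("use_cache", True)


class ChildConfig(ParentConfig):
    def __init__(self, use_cache=False, **kwargs):
        super().__init__(**kwargs)   # parent sets its defaults first
        self.use_cache = use_cache   # the child's value is what sticks


assert ChildConfig().use_cache is False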

examples/modular-transformers/configuration_my_new_model2.py (+6, -107)

@@ -1,116 +1,16 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from <path_to_modular_file.py>.
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_xxx.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_my_new_model2.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_my_new_model2.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 
 from ...configuration_utils import PretrainedConfig
 from ...modeling_rope_utils import rope_config_validation
 
 
 class MyNewModel2Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`MyNewModel2Model`]. It is used to instantiate an MyNewModel2
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the MyNewModel2-7B.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the MyNewModel2 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`MyNewModel2Model`]
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 11008):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
-            `num_attention_heads`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. MyNewModel2 1 supports up to 2048 tokens,
-            MyNewModel2 2 up to 4096, CodeMyNewModel2 up to 16384.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        pad_token_id (`int`, *optional*):
-            Padding token id.
-        bos_token_id (`int`, *optional*, defaults to 1):
-            Beginning of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 2):
-            End of stream token id.
-        pretraining_tp (`int`, *optional*, defaults to 1):
-            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
-            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
-            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
-            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether to tie weight embeddings
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
-        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
-            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-            accordingly.
-            Expected contents:
-                `rope_type` (`str`):
-                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                    'my_new_model23'], with 'default' being the original RoPE implementation.
-                `factor` (`float`, *optional*):
-                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                    original maximum pre-trained length.
-                `original_max_position_embeddings` (`int`, *optional*):
-                    Used with 'dynamic', 'longrope' and 'my_new_model23'. The original max position embeddings used during
-                    pretraining.
-                `attention_factor` (`float`, *optional*):
-                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                    computation. If unspecified, it defaults to value recommended by the implementation, using the
-                    `factor` field to infer the suggested value.
-                `beta_fast` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 32.
-                `beta_slow` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 1.
-                `short_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `long_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `low_freq_factor` (`float`, *optional*):
-                    Only used with 'my_new_model23'. Scaling factor applied to low frequency components of the RoPE
-                `high_freq_factor` (`float`, *optional*):
-                    Only used with 'my_new_model23'. Scaling factor applied to high frequency components of the RoPE
-        attention_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        mlp_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
-        head_dim (`int`, *optional*):
-            The attention head dimension. If None, it will default to hidden_size // num_heads
     This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the Gemma-7B.
@@ -121,7 +21,6 @@ class MyNewModel2Config(PretrainedConfig):
         vocab_size (`int`, *optional*, defaults to 256000):
             Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
             `inputs_ids` passed when calling [`GemmaModel`]
-
     ```python
     >>> from transformers import GemmaModel, GemmaConfig
     >>> # Initializing a Gemma gemma-7b style configuration
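
Here the regenerated file drops the duplicated, renamed docstring and keeps only the inherited Gemma one. A hypothetical sketch of a modular definition that would behave this way, where the class is defined purely by inheritance so the converter carries the parent docstring through (GemmaConfig as the parent is an assumption, not taken from this diff):

# Hypothetical modular source: no new arguments or docstring of its own, so the
# generated file reuses the parent (Gemma) documentation.
from transformers.models.gemma.configuration_gemma import GemmaConfig


class MyNewModel2Config(GemmaConfig):
    pass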

examples/modular-transformers/configuration_new_model.py (+13, -14)

@@ -1,9 +1,9 @@
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from <path_to_modular_file.py>.
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_xxx.py file directly. One of our CI enforces this
-# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from examples/modular-transformers/modular_new_model.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_new_model.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 # Example where we only want to overwrite the defaults of an init
 
 from ...configuration_utils import PretrainedConfig
@@ -104,6 +104,13 @@ def __init__(
         attention_dropout=0.0,
         **kwargs,
     ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -121,14 +128,6 @@ def __init__(
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
 
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
     @property
     def num_heads(self):
         return self.num_attention_heads
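
The generated headers state that a CI job enforces the "do not edit by hand" rule, and the commit message mentions check_modular_conversion.py. Conceptually, such a check regenerates each file from its modular source and fails when the committed copy drifts; a rough, self-contained sketch of that idea (the real utils/check_modular_conversion.py is more involved and is not reproduced here):

import difflib


def assert_in_sync(committed: str, regenerated: str, path: str) -> None:
    # Fail with a readable diff if the committed generated file no longer
    # matches a fresh regeneration from its modular source.
    diff = list(
        difflib.unified_diff(
            committed.splitlines(keepends=True),
            regenerated.splitlines(keepends=True),
            fromfile=f"{path} (committed)",
            tofile=f"{path} (regenerated)",
        )
    )
    if diff:
        raise AssertionError("".join(diff))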
