From 19097d634415bd1325d2c1883727e2e58beccf9d Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 9 Apr 2024 00:56:22 -0400
Subject: [PATCH 1/6] Falcon chg

---
 src/transformers/models/falcon/configuration_falcon.py | 8 +++++++-
 src/transformers/models/falcon/modeling_falcon.py      | 8 ++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py
index 23ed4acb5e36..3d134a4bcbb7 100644
--- a/src/transformers/models/falcon/configuration_falcon.py
+++ b/src/transformers/models/falcon/configuration_falcon.py
@@ -128,6 +128,8 @@ def __init__(
         rope_scaling=None,
         bos_token_id=11,
         eos_token_id=11,
+        ffn_hidden_size=None,
+        activation='gelu',
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -141,7 +143,6 @@ def __init__(
         self.use_cache = use_cache
         self.hidden_dropout = hidden_dropout
         self.attention_dropout = attention_dropout
-
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
         self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads
@@ -153,6 +154,11 @@ def __init__(
         self.max_position_embeddings = max_position_embeddings
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
+        self.activation = activation
+        if ffn_hidden_size is None:
+            self.ffn_hidden_size = hidden_size * 4
+        else:
+            self.ffn_hidden_size = ffn_hidden_size
         self._rope_scaling_validation()
 
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
index c2f3f91d5c00..755ff6cd35ce 100644
--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -733,15 +733,15 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
             (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
         )
 
-
+from transformers.activations import get_activation
 class FalconMLP(nn.Module):
     def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
 
-        self.dense_h_to_4h = FalconLinear(hidden_size, 4 * hidden_size, bias=config.bias)
-        self.act = nn.GELU()
-        self.dense_4h_to_h = FalconLinear(4 * hidden_size, hidden_size, bias=config.bias)
+        self.dense_h_to_4h = FalconLinear(hidden_size, config.ffn_hidden_size, bias=config.bias)
+        self.act = get_activation(config.activation)
+        self.dense_4h_to_h = FalconLinear(config.ffn_hidden_size, hidden_size, bias=config.bias)
         self.hidden_dropout = config.hidden_dropout
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:

From 78c7653b2a135a69a3e314eaae21ee0ddc416a4f Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 9 Apr 2024 01:10:28 -0400
Subject: [PATCH 2/6] delta

---
 src/transformers/models/falcon/modeling_falcon.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
index 755ff6cd35ce..97b14391ff05 100644
--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -23,6 +23,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
 from torch.nn import functional as F
 
+from transformers.activations import get_activation
 from ...modeling_attn_mask_utils import (
     AttentionMaskConverter,
     _prepare_4d_causal_attention_mask,
@@ -733,7 +734,7 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
             (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
         )
 
-from transformers.activations import get_activation
+
 class FalconMLP(nn.Module):
     def __init__(self, config: FalconConfig):
         super().__init__()

From 0da4bc62847bbe6042f6daf51f02c5177429ce1f Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 9 Apr 2024 14:55:51 -0400
Subject: [PATCH 3/6] Docstring

---
 src/transformers/models/falcon/configuration_falcon.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py
index 3d134a4bcbb7..edeb24b90bb4 100644
--- a/src/transformers/models/falcon/configuration_falcon.py
+++ b/src/transformers/models/falcon/configuration_falcon.py
@@ -12,7 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Falcon configuration"""
+"""Falcon configuration"""
+
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
 
@@ -87,6 +88,10 @@ class FalconConfig(PretrainedConfig):
             The id of the "beginning-of-sequence" token.
         eos_token_id (`int`, *optional*, defaults to 11):
             The id of the "end-of-sequence" token.
+        ffn_hidden_size (`int`, *optional*, defaults to 4x hidden dim):
+            The hidden size of the feedforward layer in the Transformer decoder.
+        activation (`str`, *optional*, defaults to "gelu"):
+            The activation function used in the feedforward layer.
 
     Example:
 
@@ -129,7 +134,7 @@ def __init__(
         bos_token_id=11,
         eos_token_id=11,
         ffn_hidden_size=None,
-        activation='gelu',
+        activation="gelu",
         **kwargs,
     ):
         self.vocab_size = vocab_size

From 4fa99f1ac5d55faf3772d5187cd1e7abb6a93546 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 9 Apr 2024 15:28:10 -0400
Subject: [PATCH 4/6] Fix import block

---
 src/transformers/models/falcon/modeling_falcon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
index 97b14391ff05..d9254bec0a73 100644
--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -23,8 +23,8 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
 from torch.nn import functional as F
 
-from transformers.activations import get_activation
+from ...activations import get_activation
 from ...modeling_attn_mask_utils import (
     AttentionMaskConverter,
     _prepare_4d_causal_attention_mask,

From 609353dd54f8291acd4059b197318566fcf3fe81 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 9 Apr 2024 15:32:17 -0400
Subject: [PATCH 5/6] doc

---
 src/transformers/models/falcon/configuration_falcon.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py
index edeb24b90bb4..909544701901 100644
--- a/src/transformers/models/falcon/configuration_falcon.py
+++ b/src/transformers/models/falcon/configuration_falcon.py
@@ -88,8 +88,9 @@ class FalconConfig(PretrainedConfig):
             The id of the "beginning-of-sequence" token.
         eos_token_id (`int`, *optional*, defaults to 11):
             The id of the "end-of-sequence" token.
-        ffn_hidden_size (`int`, *optional*, defaults to 4x hidden dim):
+        ffn_hidden_size (`int`, *optional*):
             The hidden size of the feedforward layer in the Transformer decoder.
+            defaults to 4x hidden dim
         activation (`str`, *optional*, defaults to "gelu"):
             The activation function used in the feedforward layer.
 

From 5be0cc3caba04e3bc5da122f592c5d0078a1c7a1 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Wed, 10 Apr 2024 15:49:10 +0000
Subject: [PATCH 6/6] fix and overwrite

---
 src/transformers/models/falcon/configuration_falcon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py
index 909544701901..61d202b09608 100644
--- a/src/transformers/models/falcon/configuration_falcon.py
+++ b/src/transformers/models/falcon/configuration_falcon.py
@@ -91,7 +91,7 @@ class FalconConfig(PretrainedConfig):
         ffn_hidden_size (`int`, *optional*):
             The hidden size of the feedforward layer in the Transformer decoder.
             defaults to 4x hidden dim
-        activation (`str`, *optional*, defaults to "gelu"):
+        activation (`str`, *optional*, defaults to `"gelu"`):
             The activation function used in the feedforward layer.
 
     Example:
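For reference, a minimal usage sketch of the two options this series introduces (`ffn_hidden_size` and `activation`), assuming the patches above are applied to a `transformers` checkout. The model dimensions below are illustrative only and do not correspond to any released Falcon checkpoint:

```python
from transformers import FalconConfig, FalconForCausalLM

# Illustrative sizes; ffn_hidden_size overrides the previously hard-coded
# 4 * hidden_size, and activation is resolved via get_activation()/ACT2FN.
config = FalconConfig(
    hidden_size=1024,
    num_hidden_layers=2,
    num_attention_heads=16,
    ffn_hidden_size=3072,
    activation="relu",
)
model = FalconForCausalLM(config)

# The MLP of each decoder layer reflects the configured values:
# dense_h_to_4h: 1024 -> 3072, act: ReLU(), dense_4h_to_h: 3072 -> 1024
print(model.transformer.h[0].mlp)
```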