efficiency_utils.py
import functools
import torch
from transformers.activations import FastGELUActivation, GELUActivation, NewGELUActivation, QuickGELUActivation

def rsetattr(obj, attr, val):
    """Recursively set a (possibly dotted) attribute path, e.g. "encoder.layer.0.act"."""
    pre, _, post = attr.rpartition(".")
    return setattr(rgetattr(obj, pre) if pre else obj, post, val)


def rgetattr(obj, attr, *args):
    """Recursively get a (possibly dotted) attribute path."""

    def _getattr(obj, attr):
        return getattr(obj, attr, *args)

    return functools.reduce(_getattr, [obj] + attr.split("."))


def fuse_gelu(model):
    # TorchScript-compiled tanh approximation of GELU (forward pass).
    @torch.jit.script
    def gelu_fwd(x):
        return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

    # TorchScript-compiled analytic gradient of the tanh GELU approximation.
    @torch.jit.script
    def gelu_bwd(g, x):
        tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
        ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
        return ff * g

    class _FusedGeLUFunction(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input):
            # Stash the input so the backward pass can recompute the gradient.
            ctx.input_tensor = input
            return gelu_fwd(input)

        @staticmethod
        def backward(ctx, grad_output):
            input = ctx.input_tensor
            return gelu_bwd(grad_output, input)

    class FusedGelu(torch.nn.Module):
        def forward(self, input):
            return _FusedGeLUFunction.apply(input)

    fused_gelu_module = FusedGelu()
    hf_gelu_functions = [GELUActivation, FastGELUActivation, NewGELUActivation, QuickGELUActivation]

    # Swap every Hugging Face GELU variant in the model for the fused implementation.
    for name, module in model.named_modules():
        for hf_gelu_function in hf_gelu_functions:
            if isinstance(module, hf_gelu_function):
                rsetattr(model, name, fused_gelu_module)

    return model
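

# Example usage (illustrative sketch; the checkpoint name below is an assumption,
# any Hugging Face model whose activation modules are GELU variants should work):
#
#   from transformers import AutoModelForCausalLM
#   model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
#   model = fuse_gelu(model)  # replaces GELU activations with the fused TorchScript version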