Move to xautodl

2021-05-18 14:08:00 +00:00
parent 98fadf8086
commit 94a149b33f
149 changed files with 94 additions and 21 deletions
--- a/xautodl/xlayers/super_transformer.py
+++ b/xautodl/xlayers/super_transformer.py
@@ -0,0 +1,123 @@
+#####################################################
+# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2021.03 #
+#####################################################
+from __future__ import division
+from __future__ import print_function
+
+import math
+from functools import partial
+from typing import Optional, Callable
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import spaces
+from .super_module import IntSpaceType
+from .super_module import BoolSpaceType
+from .super_module import LayerOrder
+from .super_module import SuperModule
+from .super_linear import SuperMLPv2
+from .super_norm import SuperLayerNorm1D
+from .super_attention import SuperAttention
+
+
+class SuperTransformerEncoderLayer(SuperModule):
+    """TransformerEncoderLayer is made up of self-attn and feedforward network.
+    This is a super model for TransformerEncoderLayer that can support search for the transformer encoder layer.
+
+    Reference:
+      - Paper: Attention Is All You Need, NeurIPS 2017
+      - PyTorch Implementation: https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html#TransformerEncoderLayer
+
+    Details:
+      the original post-norm version: MHA -> residual -> norm -> MLP -> residual -> norm
+      the pre-norm version: norm -> MHA -> residual -> norm -> MLP -> residual
+    """
+
+    def __init__(
+        self,
+        d_model: IntSpaceType,
+        num_heads: IntSpaceType,
+        qkv_bias: BoolSpaceType = False,
+        mlp_hidden_multiplier: IntSpaceType = 4,
+        drop: Optional[float] = None,
+        norm_affine: bool = True,
+        act_layer: Callable[[], nn.Module] = nn.GELU,
+        order: LayerOrder = LayerOrder.PreNorm,
+    ):
+        super(SuperTransformerEncoderLayer, self).__init__()
+        mha = SuperAttention(
+            d_model,
+            d_model,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=drop,
+            proj_drop=drop,
+        )
+        mlp = SuperMLPv2(
+            d_model,
+            hidden_multiplier=mlp_hidden_multiplier,
+            out_features=d_model,
+            act_layer=act_layer,
+            drop=drop,
+        )
+        if order is LayerOrder.PreNorm:
+            self.norm1 = SuperLayerNorm1D(d_model, elementwise_affine=norm_affine)
+            self.mha = mha
+            self.drop1 = nn.Dropout(drop or 0.0)
+            self.norm2 = SuperLayerNorm1D(d_model, elementwise_affine=norm_affine)
+            self.mlp = mlp
+            self.drop2 = nn.Dropout(drop or 0.0)
+        elif order is LayerOrder.PostNorm:
+            self.mha = mha
+            self.drop1 = nn.Dropout(drop or 0.0)
+            self.norm1 = SuperLayerNorm1D(d_model, elementwise_affine=norm_affine)
+            self.mlp = mlp
+            self.drop2 = nn.Dropout(drop or 0.0)
+            self.norm2 = SuperLayerNorm1D(d_model, elementwise_affine=norm_affine)
+        else:
+            raise ValueError("Unknown order: {:}".format(order))
+        self._order = order
+
+    @property
+    def abstract_search_space(self):
+        root_node = spaces.VirtualNode(id(self))
+        xdict = dict(
+            mha=self.mha.abstract_search_space,
+            norm1=self.norm1.abstract_search_space,
+            mlp=self.mlp.abstract_search_space,
+            norm2=self.norm2.abstract_search_space,
+        )
+        for key, space in xdict.items():
+            if not spaces.is_determined(space):
+                root_node.append(key, space)
+        return root_node
+
+    def apply_candidate(self, abstract_child: spaces.VirtualNode):
+        super(SuperTransformerEncoderLayer, self).apply_candidate(abstract_child)
+        valid_keys = ["mha", "norm1", "mlp", "norm2"]
+        for key in valid_keys:
+            if key in abstract_child:
+                getattr(self, key).apply_candidate(abstract_child[key])
+
+    def forward_candidate(self, input: torch.Tensor) -> torch.Tensor:
+        return self.forward_raw(input)
+
+    def forward_raw(self, input: torch.Tensor) -> torch.Tensor:
+        if self._order is LayerOrder.PreNorm:
+            x = self.norm1(input)
+            x = x + self.drop1(self.mha(x))
+            x = self.norm2(x)
+            x = x + self.drop2(self.mlp(x))
+        elif self._order is LayerOrder.PostNorm:
+            # multi-head attention
+            x = self.mha(input)
+            x = x + self.drop1(x)
+            x = self.norm1(x)
+            # feed-forward layer
+            x = x + self.drop2(self.mlp(x))
+            x = self.norm2(x)
+        else:
+            raise ValueError("Unknown order: {:}".format(self._order))
+        return x