Commit f492db25 authored by Kartikay Khandelwal, committed by Facebook Github Bot

Refactor Fairseq models for BERT and XLM to use TransformerSentenceEncoder (#622)

Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/622

Also update some defaults to more meaningful values: encoder_normalize_before and use_bert_layer_norm in TransformerSentenceEncoderLayer now default to False.

Reviewed By: rutyrinott

Differential Revision: D14761263

fbshipit-source-id: 7ac670aa370f315ddfb511c63273583a6062c569
parent f040158a
@@ -37,8 +37,8 @@ class TransformerSentenceEncoderLayer(nn.Module):
         dropout: float = 0.1,
         attention_dropout: float = 0.1,
         activation_dropout: float = 0.1,
-        encoder_normalize_before: bool = True,
-        use_bert_layer_norm: bool = True,
+        encoder_normalize_before: bool = False,
+        use_bert_layer_norm: bool = False,
         use_gelu: bool = True,
     ) -> None:
 
@@ -108,6 +108,7 @@ class TransformerSentenceEncoderLayer(nn.Module):
         x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self._maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
+
         residual = x
         x = self._maybe_layer_norm(self.final_layer_norm, x, before=True)
         x = self.activation_fn(self.fc1(x))
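
The flipped defaults in the first hunk switch TransformerSentenceEncoderLayer from pre-norm to post-norm behavior. The body of _maybe_layer_norm is not part of this diff; the sketch below is a minimal, assumed reconstruction of the usual fairseq pattern, where the helper applies the given LayerNorm either before or after a sublayer depending on how the layer was configured. The standalone function name and the explicit normalize_before argument are illustrative, not taken from this commit.

    import torch
    import torch.nn as nn

    def maybe_layer_norm(layer_norm: nn.LayerNorm, x: torch.Tensor,
                         normalize_before: bool, before: bool = False,
                         after: bool = False) -> torch.Tensor:
        # Apply layer_norm only when the call site matches the configured mode:
        # pre-norm (normalize_before=True) normalizes at before=True call sites,
        # post-norm (normalize_before=False, the new default) at after=True sites.
        assert before ^ after, "set exactly one of before/after"
        if after ^ normalize_before:
            return layer_norm(x)
        return x

With the new default encoder_normalize_before=False, the calls in the second hunk reduce to the classic BERT-style post-norm residual block, i.e. x = LayerNorm(residual + sublayer(x)).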