@inproceedings{brody-etal-2023-expressivity,
title = "On the Expressivity Role of {L}ayer{N}orm in Transformers{'} Attention",
author = "Brody, Shaked and
Alon, Uri and
Yahav, Eran",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.895",
doi = "10.18653/v1/2023.findings-acl.895",
pages = "14211--14221",
abstract = "Layer Normalization (LayerNorm) is an inherent component in all Transformer-based models. In this paper, we show that LayerNorm is crucial to the expressivity of the multi-head attention layer that follows it. This is in contrast to the common belief that LayerNorm{'}s only role is to normalize the activations during the forward pass, and their gradients during the backward pass. We consider a geometric interpretation of LayerNorm and show that it consists of two components: (a) projection of the input vectors to a d-1 space that is orthogonal to the [1,1,...,1] vector, and (b) scaling of all vectors to the same norm of $\sqrt{d}$. We show that each of these components is important for the attention layer that follows it in Transformers: (a) projection allows the attention mechanism to create an attention query that attends to all keys equally, offloading the need to learn this operation in the attention; and (b) scaling allows each key to potentially receive the highest attention, and prevents keys from being {``}un-select-able{''}. We show empirically that Transformers do indeed benefit from these properties of LayerNorm in general language modeling and even in computing simple functions such as {``}majority{''}. Our code is available at \url{https://github.com/tech-srl/layer_norm_expressivity_role} .",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="brody-etal-2023-expressivity">
<titleInfo>
<title>On the Expressivity Role of LayerNorm in Transformers’ Attention</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shaked</namePart>
<namePart type="family">Brody</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Uri</namePart>
<namePart type="family">Alon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eran</namePart>
<namePart type="family">Yahav</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Layer Normalization (LayerNorm) is an inherent component in all Transformer-based models. In this paper, we show that LayerNorm is crucial to the expressivity of the multi-head attention layer that follows it. This is in contrast to the common belief that LayerNorm’s only role is to normalize the activations during the forward pass, and their gradients during the backward pass. We consider a geometric interpretation of LayerNorm and show that it consists of two components: (a) projection of the input vectors to a d-1 space that is orthogonal to the [1,1,...,1] vector, and (b) scaling of all vectors to the same norm of √d. We show that each of these components is important for the attention layer that follows it in Transformers: (a) projection allows the attention mechanism to create an attention query that attends to all keys equally, offloading the need to learn this operation in the attention; and (b) scaling allows each key to potentially receive the highest attention, and prevents keys from being “un-select-able”. We show empirically that Transformers do indeed benefit from these properties of LayerNorm in general language modeling and even in computing simple functions such as “majority”. Our code is available at https://github.com/tech-srl/layer_norm_expressivity_role .</abstract>
<identifier type="citekey">brody-etal-2023-expressivity</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.895</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.895</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>14211</start>
<end>14221</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T On the Expressivity Role of LayerNorm in Transformers’ Attention
%A Brody, Shaked
%A Alon, Uri
%A Yahav, Eran
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F brody-etal-2023-expressivity
%X Layer Normalization (LayerNorm) is an inherent component in all Transformer-based models. In this paper, we show that LayerNorm is crucial to the expressivity of the multi-head attention layer that follows it. This is in contrast to the common belief that LayerNorm’s only role is to normalize the activations during the forward pass, and their gradients during the backward pass. We consider a geometric interpretation of LayerNorm and show that it consists of two components: (a) projection of the input vectors to a d-1 space that is orthogonal to the [1,1,...,1] vector, and (b) scaling of all vectors to the same norm of √d. We show that each of these components is important for the attention layer that follows it in Transformers: (a) projection allows the attention mechanism to create an attention query that attends to all keys equally, offloading the need to learn this operation in the attention; and (b) scaling allows each key to potentially receive the highest attention, and prevents keys from being “un-select-able”. We show empirically that Transformers do indeed benefit from these properties of LayerNorm in general language modeling and even in computing simple functions such as “majority”. Our code is available at https://github.com/tech-srl/layer_norm_expressivity_role .
%R 10.18653/v1/2023.findings-acl.895
%U https://aclanthology.org/2023.findings-acl.895
%U https://doi.org/10.18653/v1/2023.findings-acl.895
%P 14211-14221
Markdown (Informal)
[On the Expressivity Role of LayerNorm in Transformers’ Attention](https://aclanthology.org/2023.findings-acl.895) (Brody et al., Findings 2023)
ACL