xlm-roberta-base


Find this model in the RoBERTa model summary
xlm-roberta-base Model Summary Plots





xlm-roberta-base Selected Model Details
layer_id layer_type N M Q alpha D alpha-hat log_SN rand_D num_traps stable_rank rank_loss
2 EMBEDDING 250002 768 325.52 2.37 0.04 14.78 6.25 0.38 1 4.89 0.00
3 EMBEDDING 768 514 1.49 2.53 0.06 8.10 3.20 0.72 1 1.94 1.00
12 DENSE 768 768 1.00 3.56 0.05 7.43 2.09 0.10 0 67.05 1.00
13 DENSE 768 768 1.00 7.72 0.07 17.98 2.33 0.24 0 41.72 1.00
14 DENSE 768 768 1.00 5.20 0.06 6.41 1.23 0.18 0 46.94 2.00
17 DENSE 768 768 1.00 4.03 0.06 6.75 1.67 0.17 1 16.76 2.00
21 DENSE 3072 768 4.00 2.53 0.04 7.94 3.14 0.29 1 7.66 0.00
24 DENSE 3072 768 4.00 6.28 0.06 11.59 1.85 0.08 0 72.74 0.00
30 DENSE 768 768 1.00 2.48 0.05 5.66 2.28 0.18 0 25.73 2.00
31 DENSE 768 768 1.00 2.55 0.08 5.19 2.04 0.19 0 44.30 2.00
32 DENSE 768 768 1.00 4.68 0.08 5.09 1.09 0.15 0 68.14 1.00
35 DENSE 768 768 1.00 4.63 0.07 4.63 1.00 0.11 0 68.76 2.00
39 DENSE 3072 768 4.00 2.62 0.03 8.08 3.09 0.20 1 8.35 0.00
42 DENSE 3072 768 4.00 5.09 0.02 9.01 1.77 0.11 0 79.82 0.00
48 DENSE 768 768 1.00 2.41 0.05 4.73 1.96 0.19 0 36.84 2.00
49 DENSE 768 768 1.00 2.63 0.05 5.22 1.98 0.19 0 35.11 2.00
50 DENSE 768 768 1.00 4.19 0.08 4.37 1.04 0.12 0 82.45 1.00
53 DENSE 768 768 1.00 3.31 0.04 4.46 1.35 0.16 0 30.99 3.00
57 DENSE 3072 768 4.00 2.67 0.03 8.09 3.03 0.19 1 8.81 0.00
60 DENSE 3072 768 4.00 3.20 0.06 5.79 1.81 0.17 0 87.94 0.00
66 DENSE 768 768 1.00 3.82 0.09 6.10 1.60 0.15 0 67.36 2.00
67 DENSE 768 768 1.00 3.69 0.07 5.75 1.56 0.12 0 74.83 2.00
68 DENSE 768 768 1.00 4.53 0.13 5.24 1.16 0.13 0 91.98 1.00
71 DENSE 768 768 1.00 2.43 0.06 3.42 1.41 0.20 0 45.03 1.00
75 DENSE 3072 768 4.00 3.22 0.03 10.09 3.14 0.18 1 7.80 0.00
78 DENSE 3072 768 4.00 4.77 0.03 10.40 2.18 0.17 0 47.94 0.00
84 DENSE 768 768 1.00 3.09 0.08 5.22 1.69 0.13 0 60.71 1.00
85 DENSE 768 768 1.00 3.23 0.07 5.44 1.68 0.13 0 64.18 1.00
86 DENSE 768 768 1.00 4.81 0.12 5.93 1.23 0.14 0 85.65 0.00
89 DENSE 768 768 1.00 2.03 0.08 3.59 1.77 0.25 0 33.71 3.00
93 DENSE 3072 768 4.00 3.59 0.04 11.75 3.27 0.20 1 6.65 0.00
96 DENSE 3072 768 4.00 4.56 0.03 10.96 2.40 0.20 0 32.96 0.00
102 DENSE 768 768 1.00 3.86 0.12 5.51 1.43 0.13 0 90.39 3.00
103 DENSE 768 768 1.00 3.28 0.06 5.34 1.62 0.15 0 60.23 4.00
104 DENSE 768 768 1.00 8.42 0.09 10.76 1.28 0.12 0 91.56 1.00
107 DENSE 768 768 1.00 1.89 0.12 3.00 1.59 0.23 0 56.04 2.00
111 DENSE 3072 768 4.00 3.75 0.04 12.24 3.27 0.20 1 6.31 0.00
114 DENSE 3072 768 4.00 4.04 0.03 9.77 2.42 0.24 0 29.25 0.00
120 DENSE 768 768 1.00 3.44 0.09 6.67 1.94 0.14 0 34.16 3.00
121 DENSE 768 768 1.00 3.24 0.10 5.06 1.56 0.14 0 83.46 4.00
122 DENSE 768 768 1.00 6.07 0.08 8.43 1.39 0.15 0 59.55 2.00
125 DENSE 768 768 1.00 1.96 0.10 3.20 1.64 0.25 0 45.92 2.00
129 DENSE 3072 768 4.00 3.62 0.05 11.68 3.23 0.20 1 6.47 0.00
132 DENSE 3072 768 4.00 3.56 0.03 9.40 2.64 0.26 0 17.25 0.00
138 DENSE 768 768 1.00 3.83 0.06 6.72 1.75 0.15 0 47.59 3.00
139 DENSE 768 768 1.00 3.40 0.09 5.41 1.59 0.17 0 71.46 4.00
140 DENSE 768 768 1.00 10.23 0.14 11.71 1.14 0.10 0 108.78 2.00
143 DENSE 768 768 1.00 2.10 0.08 3.48 1.66 0.20 0 37.25 2.00
147 DENSE 3072 768 4.00 3.58 0.05 11.24 3.14 0.19 1 7.18 0.00
150 DENSE 3072 768 4.00 3.77 0.03 9.70 2.57 0.25 0 18.85 0.00
156 DENSE 768 768 1.00 5.46 0.06 10.10 1.85 0.17 0 40.32 3.00
157 DENSE 768 768 1.00 3.90 0.13 6.02 1.54 0.17 0 82.93 4.00
158 DENSE 768 768 1.00 7.15 0.13 8.04 1.13 0.09 0 114.92 4.00
161 DENSE 768 768 1.00 1.90 0.07 3.25 1.71 0.28 0 38.54 1.00
165 DENSE 3072 768 4.00 3.61 0.04 11.02 3.06 0.19 1 8.17 0.00
168 DENSE 3072 768 4.00 3.61 0.02 8.90 2.47 0.23 0 24.46 0.00
174 DENSE 768 768 1.00 4.61 0.07 9.69 2.10 0.16 0 23.75 3.00
175 DENSE 768 768 1.00 3.82 0.09 6.17 1.62 0.16 0 72.21 4.00
176 DENSE 768 768 1.00 7.19 0.05 7.97 1.11 0.07 0 94.91 3.00
179 DENSE 768 768 1.00 4.00 0.07 6.61 1.65 0.15 0 26.88 1.00
183 DENSE 3072 768 4.00 3.54 0.05 10.49 2.96 0.17 1 8.60 0.00
186 DENSE 3072 768 4.00 5.24 0.06 12.78 2.44 0.17 0 24.62 0.00
192 DENSE 768 768 1.00 3.37 0.09 7.50 2.23 0.18 0 17.29 1.00
193 DENSE 768 768 1.00 13.18 0.10 20.77 1.58 0.16 0 77.81 3.00
194 DENSE 768 768 1.00 6.86 0.05 7.94 1.16 0.04 0 67.11 3.00
197 DENSE 768 768 1.00 5.25 0.07 9.13 1.74 0.14 0 13.30 1.00
201 DENSE 3072 768 4.00 3.50 0.02 9.33 2.66 0.17 1 11.43 0.00
204 DENSE 3072 768 4.00 4.69 0.08 12.43 2.65 0.21 0 11.68 0.00
210 DENSE 768 768 1.00 2.44 0.08 4.97 2.03 0.17 0 28.89 2.00
211 DENSE 768 768 1.00 5.95 0.10 10.22 1.72 0.12 0 66.87 2.00
212 DENSE 768 768 1.00 10.37 0.15 9.25 0.89 0.03 0 168.65 3.00
215 DENSE 768 768 1.00 4.26 0.08 7.92 1.86 0.16 1 11.22 1.00
219 DENSE 3072 768 4.00 2.67 0.03 6.32 2.37 0.21 1 21.68 0.00
222 DENSE 3072 768 4.00 3.71 0.09 10.35 2.79 0.30 7 5.40 0.00
226 DENSE 768 768 1.00 11.74 0.18 0.99 0.08 0.00 0 194.76 2.00
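
The columns above follow the usual WeightWatcher-style per-layer report: N and M are the weight-matrix dimensions and Q = N/M; alpha is the power-law exponent fitted to the layer's empirical spectral density (ESD) and D is its Kolmogorov-Smirnov fit distance (the D_ks of the captions below); log_SN is the log10 spectral norm, and alpha-hat is approximately alpha × log_SN (the alpha-weighted exponent, as the rows themselves confirm). rand_D, num_traps, stable_rank, and rank_loss are further spectral diagnostics (presumably the distance to a randomized ESD, the number of detected correlation traps, the stable rank, and the hard-rank loss). Below is a minimal sketch of how a table with these columns could be regenerated, assuming the open-source weightwatcher and transformers packages; exact column names vary between weightwatcher versions.

```python
# Minimal sketch (not the exact script behind this report): regenerate a
# per-layer details table for xlm-roberta-base with the open-source
# `weightwatcher` package. Column names differ across versions
# (e.g. alpha-hat typically appears as `alpha_weighted`).
import weightwatcher as ww
from transformers import AutoModel

model = AutoModel.from_pretrained("xlm-roberta-base")

watcher = ww.WeightWatcher(model=model)
details = watcher.analyze(randomize=True)  # randomize=True adds the rand-distance style columns

wanted = ["layer_type", "N", "M", "Q", "alpha", "D", "alpha_weighted",
          "log_spectral_norm", "rand_distance", "stable_rank", "rank_loss"]
cols = [c for c in wanted if c in details.columns]  # keep whatever this version provides
print(details[cols].round(2).to_string())
```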

xlm-roberta-base Layer Plots
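
Each panel in this section is captioned with the same per-layer quantities as the details table above. A minimal sketch, again assuming the weightwatcher and transformers packages, of how the ESD plots for a few specific layer ids (for example the first panels below) could be produced; which figures get drawn, and how the `layers` filter behaves, depend on the weightwatcher version:

```python
# Minimal sketch: produce the per-layer ESD / power-law fit plots for a few
# layer ids. Assumes the `weightwatcher` and `transformers` packages; the
# exact set of figures produced depends on the weightwatcher version in use.
import weightwatcher as ww
from transformers import AutoModel

model = AutoModel.from_pretrained("xlm-roberta-base")
watcher = ww.WeightWatcher(model=model)

# layers= restricts the analysis to specific layer ids (here the first three
# panels below); plot=True draws the ESD figures for each analyzed layer.
details = watcher.analyze(layers=[2, 3, 12], plot=True)
print(details)  # fit results (alpha, D, ...) for just those layers
```
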
Layer 2
   Layer=2  |  N=250002  |  M=768  |  Q=325.52  |  alpha=2.37  |  D_ks=0.04  |  alpha-hat=14.78  |  num traps=1









Layer 3
   Layer=3  |  N=768  |  M=514  |  Q=1.49  |  alpha=2.53  |  D_ks=0.06  |  alpha-hat=8.10  |  num traps=1









Layer 12
   Layer=12  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.56  |  D_ks=0.05  |  alpha-hat=7.43  |  num traps=0









Layer 13
   Layer=13  |  N=768  |  M=768  |  Q=1.00  |  alpha=7.72  |  D_ks=0.07  |  alpha-hat=17.98  |  num traps=0









Layer 14
   Layer=14  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.20  |  D_ks=0.06  |  alpha-hat=6.41  |  num traps=0









Layer 17
   Layer=17  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.03  |  D_ks=0.06  |  alpha-hat=6.75  |  num traps=1









Layer 21
   Layer=21  |  N=3072  |  M=768  |  Q=4.00  |  alpha=2.53  |  D_ks=0.04  |  alpha-hat=7.94  |  num traps=1









Layer 24
   Layer=24  |  N=3072  |  M=768  |  Q=4.00  |  alpha=6.28  |  D_ks=0.06  |  alpha-hat=11.59  |  num traps=0









Layer 30
   Layer=30  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.48  |  D_ks=0.05  |  alpha-hat=5.66  |  num traps=0









Layer 31
   Layer=31  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.55  |  D_ks=0.08  |  alpha-hat=5.19  |  num traps=0









Layer 32
   Layer=32  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.68  |  D_ks=0.08  |  alpha-hat=5.09  |  num traps=0









Layer 35
   Layer=35  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.63  |  D_ks=0.07  |  alpha-hat=4.63  |  num traps=0









Layer 39
   Layer=39  |  N=3072  |  M=768  |  Q=4.00  |  alpha=2.62  |  D_ks=0.03  |  alpha-hat=8.08  |  num traps=1









Layer 42
   Layer=42  |  N=3072  |  M=768  |  Q=4.00  |  alpha=5.09  |  D_ks=0.02  |  alpha-hat=9.01  |  num traps=0









Layer 48
   Layer=48  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.41  |  D_ks=0.05  |  alpha-hat=4.73  |  num traps=0









Layer 49
   Layer=49  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.63  |  D_ks=0.05  |  alpha-hat=5.22  |  num traps=0









Layer 50
   Layer=50  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.19  |  D_ks=0.08  |  alpha-hat=4.37  |  num traps=0









Layer 53
   Layer=53  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.31  |  D_ks=0.04  |  alpha-hat=4.46  |  num traps=0









Layer 57
   Layer=57  |  N=3072  |  M=768  |  Q=4.00  |  alpha=2.67  |  D_ks=0.03  |  alpha-hat=8.09  |  num traps=1









Layer 60
   Layer=60  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.20  |  D_ks=0.06  |  alpha-hat=5.79  |  num traps=0









Layer 66
   Layer=66  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.82  |  D_ks=0.09  |  alpha-hat=6.10  |  num traps=0









Layer 67
   Layer=67  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.69  |  D_ks=0.07  |  alpha-hat=5.75  |  num traps=0









Layer 68
   Layer=68  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.53  |  D_ks=0.13  |  alpha-hat=5.24  |  num traps=0









Layer 71
   Layer=71  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.43  |  D_ks=0.06  |  alpha-hat=3.42  |  num traps=0









Layer 75
   Layer=75  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.22  |  D_ks=0.03  |  alpha-hat=10.09  |  num traps=1









Layer 78
   Layer=78  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.77  |  D_ks=0.03  |  alpha-hat=10.40  |  num traps=0









Layer 84
   Layer=84  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.09  |  D_ks=0.08  |  alpha-hat=5.22  |  num traps=0









Layer 85
   Layer=85  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.23  |  D_ks=0.07  |  alpha-hat=5.44  |  num traps=0









Layer 86
   Layer=86  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.81  |  D_ks=0.12  |  alpha-hat=5.93  |  num traps=0









Layer 89
   Layer=89  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.03  |  D_ks=0.08  |  alpha-hat=3.59  |  num traps=0









Layer 93
   Layer=93  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.59  |  D_ks=0.04  |  alpha-hat=11.75  |  num traps=1









Layer 96
   Layer=96  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.56  |  D_ks=0.03  |  alpha-hat=10.96  |  num traps=0









Layer 102
   Layer=102  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.86  |  D_ks=0.12  |  alpha-hat=5.51  |  num traps=0









Layer 103
   Layer=103  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.28  |  D_ks=0.06  |  alpha-hat=5.34  |  num traps=0









Layer 104
   Layer=104  |  N=768  |  M=768  |  Q=1.00  |  alpha=8.42  |  D_ks=0.09  |  alpha-hat=10.76  |  num traps=0









Layer 107
   Layer=107  |  N=768  |  M=768  |  Q=1.00  |  alpha=1.89  |  D_ks=0.12  |  alpha-hat=3.00  |  num traps=0









Layer 111
   Layer=111  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.75  |  D_ks=0.04  |  alpha-hat=12.24  |  num traps=1









Layer 114
   Layer=114  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.04  |  D_ks=0.03  |  alpha-hat=9.77  |  num traps=0









Layer 120
   Layer=120  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.44  |  D_ks=0.09  |  alpha-hat=6.67  |  num traps=0









Layer 121
   Layer=121  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.24  |  D_ks=0.10  |  alpha-hat=5.06  |  num traps=0









Layer 122
   Layer=122  |  N=768  |  M=768  |  Q=1.00  |  alpha=6.07  |  D_ks=0.08  |  alpha-hat=8.43  |  num traps=0









Layer 125
   Layer=125  |  N=768  |  M=768  |  Q=1.00  |  alpha=1.96  |  D_ks=0.10  |  alpha-hat=3.20  |  num traps=0









Layer 129
   Layer=129  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.62  |  D_ks=0.05  |  alpha-hat=11.68  |  num traps=1









Layer 132
   Layer=132  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.56  |  D_ks=0.03  |  alpha-hat=9.40  |  num traps=0









Layer 138
   Layer=138  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.83  |  D_ks=0.06  |  alpha-hat=6.72  |  num traps=0









Layer 139
   Layer=139  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.40  |  D_ks=0.09  |  alpha-hat=5.41  |  num traps=0









Layer 140
   Layer=140  |  N=768  |  M=768  |  Q=1.00  |  alpha=10.23  |  D_ks=0.14  |  alpha-hat=11.71  |  num traps=0









Layer 143
   Layer=143  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.10  |  D_ks=0.08  |  alpha-hat=3.48  |  num traps=0









Layer 147
   Layer=147  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.58  |  D_ks=0.05  |  alpha-hat=11.24  |  num traps=1









Layer 150
   Layer=150  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.77  |  D_ks=0.03  |  alpha-hat=9.70  |  num traps=0









Layer 156
   Layer=156  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.46  |  D_ks=0.06  |  alpha-hat=10.10  |  num traps=0









Layer 157
   Layer=157  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.90  |  D_ks=0.13  |  alpha-hat=6.02  |  num traps=0









Layer 158
   Layer=158  |  N=768  |  M=768  |  Q=1.00  |  alpha=7.15  |  D_ks=0.13  |  alpha-hat=8.04  |  num traps=0









Layer 161
   Layer=161  |  N=768  |  M=768  |  Q=1.00  |  alpha=1.90  |  D_ks=0.07  |  alpha-hat=3.25  |  num traps=0









Layer 165
   Layer=165  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.61  |  D_ks=0.04  |  alpha-hat=11.02  |  num traps=1









Layer 168
   Layer=168  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.61  |  D_ks=0.02  |  alpha-hat=8.90  |  num traps=0









Layer 174
   Layer=174  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.61  |  D_ks=0.07  |  alpha-hat=9.69  |  num traps=0









Layer 175
   Layer=175  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.82  |  D_ks=0.09  |  alpha-hat=6.17  |  num traps=0









Layer 176
   Layer=176  |  N=768  |  M=768  |  Q=1.00  |  alpha=7.19  |  D_ks=0.05  |  alpha-hat=7.97  |  num traps=0









Layer 179
   Layer=179  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.00  |  D_ks=0.07  |  alpha-hat=6.61  |  num traps=0









Layer 183
   Layer=183  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.54  |  D_ks=0.05  |  alpha-hat=10.49  |  num traps=1









Layer 186
   Layer=186  |  N=3072  |  M=768  |  Q=4.00  |  alpha=5.24  |  D_ks=0.06  |  alpha-hat=12.78  |  num traps=0









Layer 192
   Layer=192  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.37  |  D_ks=0.09  |  alpha-hat=7.50  |  num traps=0









Layer 193
   Layer=193  |  N=768  |  M=768  |  Q=1.00  |  alpha=13.18  |  D_ks=0.10  |  alpha-hat=20.77  |  num traps=0









Layer 194
   Layer=194  |  N=768  |  M=768  |  Q=1.00  |  alpha=6.86  |  D_ks=0.05  |  alpha-hat=7.94  |  num traps=0









Layer 197
   Layer=197  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.25  |  D_ks=0.07  |  alpha-hat=9.13  |  num traps=0









Layer 201
   Layer=201  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.50  |  D_ks=0.02  |  alpha-hat=9.33  |  num traps=1









Layer 204
   Layer=204  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.69  |  D_ks=0.08  |  alpha-hat=12.43  |  num traps=0









Layer 210
   Layer=210  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.44  |  D_ks=0.08  |  alpha-hat=4.97  |  num traps=0









Layer 211
   Layer=211  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.95  |  D_ks=0.10  |  alpha-hat=10.22  |  num traps=0









Layer 212
   Layer=212  |  N=768  |  M=768  |  Q=1.00  |  alpha=10.37  |  D_ks=0.15  |  alpha-hat=9.25  |  num traps=0









Layer 215
   Layer=215  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.26  |  D_ks=0.08  |  alpha-hat=7.92  |  num traps=1









Layer 219
   Layer=219  |  N=3072  |  M=768  |  Q=4.00  |  alpha=2.67  |  D_ks=0.03  |  alpha-hat=6.32  |  num traps=1









Layer 222
   Layer=222  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.71  |  D_ks=0.09  |  alpha-hat=10.35  |  num traps=7









Layer 226
   Layer=226  |  N=768  |  M=768  |  Q=1.00  |  alpha=11.74  |  D_ks=0.18  |  alpha-hat=0.99  |  num traps=0
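
For reference, the alpha and D_ks values shown in the captions above are the exponent and Kolmogorov-Smirnov distance of a power-law fit to a layer's empirical spectral density, i.e. the eigenvalues of W^T W, and log_SN in the details table is the log10 of the largest such eigenvalue. Below is a minimal sketch of that fit for a single dense layer, assuming transformers (with PyTorch), NumPy, and the `powerlaw` package; the layer picked here and the automatic xmin selection are illustrative only, so the numbers will not exactly match the table, which reflects the report's own fitting conventions.

```python
# Minimal sketch: fit a power law to one layer's eigenvalue spectrum and
# report alpha / D_ks / log_SN, analogous to the caption values above.
# The choice of module to inspect is illustrative only.
import numpy as np
import powerlaw
from transformers import AutoModel

model = AutoModel.from_pretrained("xlm-roberta-base")

# Example layer: the first encoder block's self-attention query projection (768x768).
W = model.encoder.layer[0].attention.self.query.weight.detach().cpu().numpy()

svals = np.linalg.svd(W, compute_uv=False)  # singular values of W
evals = svals ** 2                          # eigenvalues of W^T W (the ESD)

fit = powerlaw.Fit(evals)                   # xmin chosen by minimizing the KS distance
alpha = fit.power_law.alpha                 # power-law exponent (cf. 'alpha')
d_ks = fit.power_law.D                      # KS distance of the fit (cf. 'D_ks')
log_sn = np.log10(evals.max())              # log10 spectral norm (cf. 'log_SN')
print(f"alpha     = {alpha:.2f}")
print(f"D_ks      = {d_ks:.2f}")
print(f"log_SN    = {log_sn:.2f}")
print(f"alpha-hat = {alpha * log_sn:.2f}")  # alpha * log_SN
```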