roberta-base


Find this model in the RoBERTa model summary
roberta-base Model Summary Plots





roberta-base Model Selected Details
  layer_type N M Q alpha D alpha-hat log_SN rand_D num_traps stable_rank rank_loss
layer_id                        
2 EMBEDDING 50265 768 65.45 2.91 0.05 12.66 4.35 0.24 1 30.34 0.00
3 EMBEDDING 768 514 1.49 1.51 0.10 4.31 2.86 0.72 0 2.38 1.00
12 DENSE 768 768 1.00 3.54 0.04 7.15 2.02 0.12 0 58.82 0.00
13 DENSE 768 768 1.00 2.89 0.07 6.10 2.11 0.18 0 46.84 2.00
14 DENSE 768 768 1.00 3.37 0.12 3.40 1.01 0.12 0 83.50 1.00
17 DENSE 768 768 1.00 4.20 0.05 6.93 1.65 0.18 0 19.30 1.00
21 DENSE 3072 768 4.00 2.77 0.03 7.36 2.66 0.19 1 16.16 0.00
24 DENSE 3072 768 4.00 3.48 0.03 6.71 1.93 0.14 0 56.94 0.00
30 DENSE 768 768 1.00 2.86 0.05 6.27 2.19 0.16 0 21.32 1.00
31 DENSE 768 768 1.00 2.68 0.05 4.99 1.86 0.18 0 43.59 1.00
32 DENSE 768 768 1.00 5.65 0.10 6.86 1.21 0.15 0 63.08 1.00
35 DENSE 768 768 1.00 2.49 0.08 3.26 1.31 0.17 0 41.78 1.00
39 DENSE 3072 768 4.00 3.15 0.02 8.79 2.79 0.15 1 13.97 0.00
42 DENSE 3072 768 4.00 3.31 0.05 6.14 1.85 0.15 0 72.90 0.00
48 DENSE 768 768 1.00 3.26 0.06 7.01 2.15 0.16 0 19.01 1.00
49 DENSE 768 768 1.00 2.97 0.04 5.05 1.70 0.16 0 52.75 1.00
50 DENSE 768 768 1.00 5.32 0.07 7.40 1.39 0.14 0 51.02 2.00
53 DENSE 768 768 1.00 3.59 0.09 4.36 1.21 0.13 0 57.65 1.00
57 DENSE 3072 768 4.00 3.82 0.02 10.51 2.75 0.13 1 15.92 0.00
60 DENSE 3072 768 4.00 4.54 0.05 8.67 1.91 0.15 0 67.96 0.00
66 DENSE 768 768 1.00 3.48 0.06 7.74 2.23 0.16 0 17.97 2.00
67 DENSE 768 768 1.00 3.46 0.06 5.74 1.66 0.12 0 62.06 1.00
68 DENSE 768 768 1.00 5.13 0.05 8.14 1.59 0.14 0 33.79 3.00
71 DENSE 768 768 1.00 3.37 0.07 4.12 1.22 0.13 0 65.07 1.00
75 DENSE 3072 768 4.00 3.86 0.03 10.39 2.69 0.14 1 17.74 0.00
78 DENSE 3072 768 4.00 4.28 0.04 8.86 2.07 0.15 0 44.78 0.00
84 DENSE 768 768 1.00 3.45 0.04 7.67 2.22 0.16 0 18.95 2.00
85 DENSE 768 768 1.00 3.37 0.06 5.66 1.68 0.13 0 61.14 1.00
86 DENSE 768 768 1.00 5.46 0.07 8.21 1.50 0.16 0 42.82 1.00
89 DENSE 768 768 1.00 3.78 0.08 4.61 1.22 0.13 0 63.95 2.00
93 DENSE 3072 768 4.00 3.84 0.03 10.15 2.64 0.14 1 17.38 0.00
96 DENSE 3072 768 4.00 4.69 0.05 9.10 1.94 0.15 0 52.14 0.00
102 DENSE 768 768 1.00 3.90 0.06 8.77 2.25 0.16 0 15.87 1.00
103 DENSE 768 768 1.00 4.16 0.08 6.60 1.59 0.11 0 67.73 3.00
104 DENSE 768 768 1.00 5.34 0.11 6.90 1.29 0.14 0 79.64 1.00
107 DENSE 768 768 1.00 4.04 0.07 5.44 1.35 0.15 0 52.37 1.00
111 DENSE 3072 768 4.00 3.89 0.02 9.71 2.49 0.14 1 21.80 0.00
114 DENSE 3072 768 4.00 4.03 0.03 7.89 1.96 0.15 0 44.20 0.00
120 DENSE 768 768 1.00 6.59 0.09 10.80 1.64 0.14 0 64.38 2.00
121 DENSE 768 768 1.00 2.52 0.11 3.93 1.56 0.15 0 71.40 2.00
122 DENSE 768 768 1.00 8.27 0.06 10.01 1.21 0.11 0 92.01 2.00
125 DENSE 768 768 1.00 4.21 0.04 6.80 1.61 0.12 0 27.92 1.00
129 DENSE 3072 768 4.00 4.03 0.03 10.66 2.65 0.13 1 16.86 0.00
132 DENSE 3072 768 4.00 4.14 0.02 8.54 2.07 0.16 0 38.34 0.00
138 DENSE 768 768 1.00 4.00 0.08 6.70 1.67 0.13 0 68.60 2.00
139 DENSE 768 768 1.00 3.68 0.06 6.34 1.72 0.15 0 57.70 2.00
140 DENSE 768 768 1.00 6.97 0.07 8.26 1.19 0.12 0 70.73 1.00
143 DENSE 768 768 1.00 4.96 0.05 6.31 1.27 0.10 0 42.32 1.00
147 DENSE 3072 768 4.00 3.91 0.03 10.09 2.58 0.13 1 19.84 0.00
150 DENSE 3072 768 4.00 3.81 0.04 8.42 2.21 0.17 0 28.86 0.00
156 DENSE 768 768 1.00 3.93 0.06 7.22 1.84 0.15 0 39.51 2.00
157 DENSE 768 768 1.00 3.62 0.08 6.04 1.67 0.16 0 55.42 2.00
158 DENSE 768 768 1.00 6.14 0.04 8.19 1.34 0.06 0 53.64 2.00
161 DENSE 768 768 1.00 5.68 0.06 7.49 1.32 0.07 0 38.14 1.00
165 DENSE 3072 768 4.00 3.67 0.03 9.92 2.70 0.14 1 14.26 0.00
168 DENSE 3072 768 4.00 3.96 0.03 8.52 2.15 0.16 0 33.47 0.00
174 DENSE 768 768 1.00 4.04 0.07 7.96 1.97 0.18 0 27.69 2.00
175 DENSE 768 768 1.00 3.29 0.10 5.44 1.65 0.16 0 55.41 2.00
176 DENSE 768 768 1.00 4.67 0.05 7.01 1.50 0.08 0 32.07 3.00
179 DENSE 768 768 1.00 5.10 0.07 6.76 1.33 0.08 0 34.44 1.00
183 DENSE 3072 768 4.00 3.64 0.03 9.65 2.65 0.14 1 12.90 0.00
186 DENSE 3072 768 4.00 4.68 0.04 9.72 2.08 0.12 0 37.68 0.00
192 DENSE 768 768 1.00 4.75 0.08 10.09 2.12 0.17 0 19.82 1.00
193 DENSE 768 768 1.00 6.05 0.14 8.84 1.46 0.15 0 84.78 2.00
194 DENSE 768 768 1.00 5.56 0.06 7.87 1.41 0.07 0 39.00 3.00
197 DENSE 768 768 1.00 4.97 0.07 6.84 1.38 0.08 0 31.87 1.00
201 DENSE 3072 768 4.00 3.69 0.03 9.09 2.47 0.14 1 15.74 0.00
204 DENSE 3072 768 4.00 4.85 0.08 10.39 2.14 0.12 0 28.68 0.00
210 DENSE 768 768 1.00 4.54 0.04 9.54 2.10 0.17 0 21.82 2.00
211 DENSE 768 768 1.00 7.23 0.05 11.39 1.58 0.12 0 70.65 2.00
212 DENSE 768 768 1.00 7.61 0.10 9.82 1.29 0.05 0 66.03 3.00
215 DENSE 768 768 1.00 5.13 0.08 7.02 1.37 0.07 0 37.41 0.00
219 DENSE 3072 768 4.00 3.57 0.02 9.15 2.56 0.13 1 13.65 0.00
222 DENSE 3072 768 4.00 4.66 0.09 10.84 2.33 0.18 0 13.62 0.00
226 DENSE 768 768 1.00 45.11 0.17 3.98 0.09 0.00 0 192.40 1.00

roberta-base Layer Plots
Layer 2
   Layer=2  |  N=50265  |  M=768  |  Q=65.45  |  alpha=2.91  |  D_ks=0.05  |  alpha-hat=12.66  |  num traps=1









Layer 3
   Layer=3  |  N=768  |  M=514  |  Q=1.49  |  alpha=1.51  |  D_ks=0.10  |  alpha-hat=4.31  |  num traps=0









Layer 12
   Layer=12  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.54  |  D_ks=0.04  |  alpha-hat=7.15  |  num traps=0









Layer 13
   Layer=13  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.89  |  D_ks=0.07  |  alpha-hat=6.10  |  num traps=0









Layer 14
   Layer=14  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.37  |  D_ks=0.12  |  alpha-hat=3.40  |  num traps=0









Layer 17
   Layer=17  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.20  |  D_ks=0.05  |  alpha-hat=6.93  |  num traps=0









Layer 21
   Layer=21  |  N=3072  |  M=768  |  Q=4.00  |  alpha=2.77  |  D_ks=0.03  |  alpha-hat=7.36  |  num traps=1









Layer 24
   Layer=24  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.48  |  D_ks=0.03  |  alpha-hat=6.71  |  num traps=0









Layer 30
   Layer=30  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.86  |  D_ks=0.05  |  alpha-hat=6.27  |  num traps=0









Layer 31
   Layer=31  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.68  |  D_ks=0.05  |  alpha-hat=4.99  |  num traps=0









Layer 32
   Layer=32  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.65  |  D_ks=0.10  |  alpha-hat=6.86  |  num traps=0









Layer 35
   Layer=35  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.49  |  D_ks=0.08  |  alpha-hat=3.26  |  num traps=0









Layer 39
   Layer=39  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.15  |  D_ks=0.02  |  alpha-hat=8.79  |  num traps=1









Layer 42
   Layer=42  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.31  |  D_ks=0.05  |  alpha-hat=6.14  |  num traps=0









Layer 48
   Layer=48  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.26  |  D_ks=0.06  |  alpha-hat=7.01  |  num traps=0









Layer 49
   Layer=49  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.97  |  D_ks=0.04  |  alpha-hat=5.05  |  num traps=0









Layer 50
   Layer=50  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.32  |  D_ks=0.07  |  alpha-hat=7.40  |  num traps=0









Layer 53
   Layer=53  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.59  |  D_ks=0.09  |  alpha-hat=4.36  |  num traps=0









Layer 57
   Layer=57  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.82  |  D_ks=0.02  |  alpha-hat=10.51  |  num traps=1









Layer 60
   Layer=60  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.54  |  D_ks=0.05  |  alpha-hat=8.67  |  num traps=0









Layer 66
   Layer=66  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.48  |  D_ks=0.06  |  alpha-hat=7.74  |  num traps=0









Layer 67
   Layer=67  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.46  |  D_ks=0.06  |  alpha-hat=5.74  |  num traps=0









Layer 68
   Layer=68  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.13  |  D_ks=0.05  |  alpha-hat=8.14  |  num traps=0









Layer 71
   Layer=71  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.37  |  D_ks=0.07  |  alpha-hat=4.12  |  num traps=0









Layer 75
   Layer=75  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.86  |  D_ks=0.03  |  alpha-hat=10.39  |  num traps=1









Layer 78
   Layer=78  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.28  |  D_ks=0.04  |  alpha-hat=8.86  |  num traps=0









Layer 84
   Layer=84  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.45  |  D_ks=0.04  |  alpha-hat=7.67  |  num traps=0









Layer 85
   Layer=85  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.37  |  D_ks=0.06  |  alpha-hat=5.66  |  num traps=0









Layer 86
   Layer=86  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.46  |  D_ks=0.07  |  alpha-hat=8.21  |  num traps=0









Layer 89
   Layer=89  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.78  |  D_ks=0.08  |  alpha-hat=4.61  |  num traps=0









Layer 93
   Layer=93  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.84  |  D_ks=0.03  |  alpha-hat=10.15  |  num traps=1









Layer 96
   Layer=96  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.69  |  D_ks=0.05  |  alpha-hat=9.10  |  num traps=0









Layer 102
   Layer=102  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.90  |  D_ks=0.06  |  alpha-hat=8.77  |  num traps=0









Layer 103
   Layer=103  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.16  |  D_ks=0.08  |  alpha-hat=6.60  |  num traps=0









Layer 104
   Layer=104  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.34  |  D_ks=0.11  |  alpha-hat=6.90  |  num traps=0









Layer 107
   Layer=107  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.04  |  D_ks=0.07  |  alpha-hat=5.44  |  num traps=0









Layer 111
   Layer=111  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.89  |  D_ks=0.02  |  alpha-hat=9.71  |  num traps=1









Layer 114
   Layer=114  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.03  |  D_ks=0.03  |  alpha-hat=7.89  |  num traps=0









Layer 120
   Layer=120  |  N=768  |  M=768  |  Q=1.00  |  alpha=6.59  |  D_ks=0.09  |  alpha-hat=10.80  |  num traps=0









Layer 121
   Layer=121  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.52  |  D_ks=0.11  |  alpha-hat=3.93  |  num traps=0









Layer 122
   Layer=122  |  N=768  |  M=768  |  Q=1.00  |  alpha=8.27  |  D_ks=0.06  |  alpha-hat=10.01  |  num traps=0









Layer 125
   Layer=125  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.21  |  D_ks=0.04  |  alpha-hat=6.80  |  num traps=0









Layer 129
   Layer=129  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.03  |  D_ks=0.03  |  alpha-hat=10.66  |  num traps=1









Layer 132
   Layer=132  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.14  |  D_ks=0.02  |  alpha-hat=8.54  |  num traps=0









Layer 138
   Layer=138  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.00  |  D_ks=0.08  |  alpha-hat=6.70  |  num traps=0









Layer 139
   Layer=139  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.68  |  D_ks=0.06  |  alpha-hat=6.34  |  num traps=0









Layer 140
   Layer=140  |  N=768  |  M=768  |  Q=1.00  |  alpha=6.97  |  D_ks=0.07  |  alpha-hat=8.26  |  num traps=0









Layer 143
   Layer=143  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.96  |  D_ks=0.05  |  alpha-hat=6.31  |  num traps=0









Layer 147
   Layer=147  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.91  |  D_ks=0.03  |  alpha-hat=10.09  |  num traps=1









Layer 150
   Layer=150  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.81  |  D_ks=0.04  |  alpha-hat=8.42  |  num traps=0









Layer 156
   Layer=156  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.93  |  D_ks=0.06  |  alpha-hat=7.22  |  num traps=0









Layer 157
   Layer=157  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.62  |  D_ks=0.08  |  alpha-hat=6.04  |  num traps=0









Layer 158
   Layer=158  |  N=768  |  M=768  |  Q=1.00  |  alpha=6.14  |  D_ks=0.04  |  alpha-hat=8.19  |  num traps=0









Layer 161
   Layer=161  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.68  |  D_ks=0.06  |  alpha-hat=7.49  |  num traps=0









Layer 165
   Layer=165  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.67  |  D_ks=0.03  |  alpha-hat=9.92  |  num traps=1









Layer 168
   Layer=168  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.96  |  D_ks=0.03  |  alpha-hat=8.52  |  num traps=0









Layer 174
   Layer=174  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.04  |  D_ks=0.07  |  alpha-hat=7.96  |  num traps=0









Layer 175
   Layer=175  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.29  |  D_ks=0.10  |  alpha-hat=5.44  |  num traps=0









Layer 176
   Layer=176  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.67  |  D_ks=0.05  |  alpha-hat=7.01  |  num traps=0









Layer 179
   Layer=179  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.10  |  D_ks=0.07  |  alpha-hat=6.76  |  num traps=0









Layer 183
   Layer=183  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.64  |  D_ks=0.03  |  alpha-hat=9.65  |  num traps=1









Layer 186
   Layer=186  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.68  |  D_ks=0.04  |  alpha-hat=9.72  |  num traps=0









Layer 192
   Layer=192  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.75  |  D_ks=0.08  |  alpha-hat=10.09  |  num traps=0









Layer 193
   Layer=193  |  N=768  |  M=768  |  Q=1.00  |  alpha=6.05  |  D_ks=0.14  |  alpha-hat=8.84  |  num traps=0









Layer 194
   Layer=194  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.56  |  D_ks=0.06  |  alpha-hat=7.87  |  num traps=0









Layer 197
   Layer=197  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.97  |  D_ks=0.07  |  alpha-hat=6.84  |  num traps=0









Layer 201
   Layer=201  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.69  |  D_ks=0.03  |  alpha-hat=9.09  |  num traps=1









Layer 204
   Layer=204  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.85  |  D_ks=0.08  |  alpha-hat=10.39  |  num traps=0









Layer 210
   Layer=210  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.54  |  D_ks=0.04  |  alpha-hat=9.54  |  num traps=0









Layer 211
   Layer=211  |  N=768  |  M=768  |  Q=1.00  |  alpha=7.23  |  D_ks=0.05  |  alpha-hat=11.39  |  num traps=0









Layer 212
   Layer=212  |  N=768  |  M=768  |  Q=1.00  |  alpha=7.61  |  D_ks=0.10  |  alpha-hat=9.82  |  num traps=0









Layer 215
   Layer=215  |  N=768  |  M=768  |  Q=1.00  |  alpha=5.13  |  D_ks=0.08  |  alpha-hat=7.02  |  num traps=0









Layer 219
   Layer=219  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.57  |  D_ks=0.02  |  alpha-hat=9.15  |  num traps=1









Layer 222
   Layer=222  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.66  |  D_ks=0.09  |  alpha-hat=10.84  |  num traps=0









Layer 226
   Layer=226  |  N=768  |  M=768  |  Q=1.00  |  alpha=45.11  |  D_ks=0.17  |  alpha-hat=3.98  |  num traps=0