gpt2


Find this model in the GPT model summary
gpt2 Model Summary Plots





gpt2 Model Selected Details
layer_id layer_type N M Q alpha D alpha-hat log_SN % Rand num_traps num_fingers rank_loss
2 EMBEDDING 1024 768 1.33 1.55 0.08 5.56 3.58 22.11 0 0 0
8 CONV1D 2304 768 3.00 2.55 0.06 8.42 3.30 78.06 0 0 0
9 CONV1D 768 768 1.00 2.76 0.04 7.86 2.85 64.61 0 0 2
14 CONV1D 3072 768 4.00 3.98 0.04 12.91 3.25 85.60 0 0 0
15 CONV1D 3072 768 4.00 3.29 0.06 9.21 2.80 83.99 12 0 0
21 CONV1D 2304 768 3.00 3.21 0.04 8.73 2.72 78.94 0 0 0
22 CONV1D 768 768 1.00 2.54 0.06 7.28 2.86 70.13 11 0 1
27 CONV1D 3072 768 4.00 3.42 0.03 10.85 3.17 84.57 0 0 0
28 CONV1D 3072 768 4.00 3.47 0.05 10.13 2.92 87.37 7 0 0
34 CONV1D 2304 768 3.00 3.52 0.04 9.84 2.79 82.62 0 0 0
35 CONV1D 768 768 1.00 3.58 0.04 8.19 2.29 83.18 0 0 0
40 CONV1D 3072 768 4.00 3.18 0.03 10.09 3.18 83.79 2 0 0
41 CONV1D 3072 768 4.00 3.32 0.02 10.51 3.17 85.78 6 0 0
47 CONV1D 2304 768 3.00 3.22 0.03 8.59 2.67 85.65 0 0 0
48 CONV1D 768 768 1.00 4.69 0.04 10.44 2.23 85.59 0 0 1
53 CONV1D 3072 768 4.00 3.64 0.02 11.01 3.02 86.56 1 0 0
54 CONV1D 3072 768 4.00 3.52 0.04 11.33 3.22 84.38 4 0 0
60 CONV1D 2304 768 3.00 2.98 0.05 7.89 2.65 84.06 0 0 0
61 CONV1D 768 768 1.00 2.94 0.11 6.83 2.32 84.59 0 1 0
66 CONV1D 3072 768 4.00 3.45 0.03 10.36 3.00 85.18 0 0 0
67 CONV1D 3072 768 4.00 3.60 0.03 10.81 3.00 83.54 2 0 0
73 CONV1D 2304 768 3.00 2.99 0.09 7.29 2.44 82.91 0 0 0
74 CONV1D 768 768 1.00 3.65 0.04 8.10 2.22 87.42 0 0 0
79 CONV1D 3072 768 4.00 3.90 0.03 11.79 3.02 85.49 1 0 0
80 CONV1D 3072 768 4.00 3.59 0.04 10.36 2.88 85.05 0 0 0
86 CONV1D 2304 768 3.00 3.14 0.08 7.48 2.38 85.88 0 0 0
87 CONV1D 768 768 1.00 3.27 0.11 7.68 2.35 84.49 0 1 1
92 CONV1D 3072 768 4.00 3.50 0.04 10.49 2.99 85.73 0 0 0
93 CONV1D 3072 768 4.00 3.87 0.03 11.41 2.94 85.95 0 0 0
99 CONV1D 2304 768 3.00 3.65 0.08 8.81 2.42 86.45 0 0 0
100 CONV1D 768 768 1.00 3.09 0.08 7.63 2.47 86.17 0 1 1
105 CONV1D 3072 768 4.00 3.74 0.04 11.24 3.00 86.01 0 0 0
106 CONV1D 3072 768 4.00 4.05 0.02 12.01 2.96 87.09 0 0 0
112 CONV1D 2304 768 3.00 4.64 0.07 10.97 2.36 90.31 0 0 0
113 CONV1D 768 768 1.00 2.89 0.13 7.53 2.61 85.45 0 3 1
118 CONV1D 3072 768 4.00 4.01 0.03 12.00 2.99 87.00 0 0 0
119 CONV1D 3072 768 4.00 4.18 0.02 12.46 2.98 87.19 0 0 0
125 CONV1D 2304 768 3.00 4.60 0.06 10.80 2.35 92.75 0 0 0
126 CONV1D 768 768 1.00 3.87 0.13 10.45 2.70 89.30 0 6 1
131 CONV1D 3072 768 4.00 3.67 0.05 11.02 3.00 86.75 0 0 0
132 CONV1D 3072 768 4.00 4.56 0.03 13.83 3.03 88.38 0 0 0
138 CONV1D 2304 768 3.00 6.53 0.05 15.35 2.35 90.09 0 0 0
139 CONV1D 768 768 1.00 4.96 0.09 15.02 3.03 85.02 1 0 0
144 CONV1D 3072 768 4.00 3.59 0.03 10.84 3.02 86.21 0 0 0
145 CONV1D 3072 768 4.00 4.53 0.02 14.66 3.24 89.11 2 0 0
151 CONV1D 2304 768 3.00 2.48 0.12 5.83 2.35 88.45 0 6 0
152 CONV1D 768 768 1.00 2.99 0.11 10.38 3.48 70.57 17 0 0
157 CONV1D 3072 768 4.00 3.32 0.03 10.44 3.15 84.31 0 0 0
158 CONV1D 3072 768 4.00 4.22 0.04 15.08 3.57 86.95 1 0 0

gpt2 Layer Plots
Layer 2
   Layer=2  |  N=1024  |  M=768  |  Q=1.33  |  alpha=1.55  |  D_ks=0.08  |  alpha-hat=5.56  |  num traps=0









Layer 8
   Layer=8  |  N=2304  |  M=768  |  Q=3.00  |  alpha=2.55  |  D_ks=0.06  |  alpha-hat=8.42  |  num traps=0









Layer 9
   Layer=9  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.76  |  D_ks=0.04  |  alpha-hat=7.86  |  num traps=0









Layer 14
   Layer=14  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.98  |  D_ks=0.04  |  alpha-hat=12.91  |  num traps=0









Layer 15
   Layer=15  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.29  |  D_ks=0.06  |  alpha-hat=9.21  |  num traps=12









Layer 21
   Layer=21  |  N=2304  |  M=768  |  Q=3.00  |  alpha=3.21  |  D_ks=0.04  |  alpha-hat=8.73  |  num traps=0









Layer 22
   Layer=22  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.54  |  D_ks=0.06  |  alpha-hat=7.28  |  num traps=11









Layer 27
   Layer=27  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.42  |  D_ks=0.03  |  alpha-hat=10.85  |  num traps=0









Layer 28
   Layer=28  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.47  |  D_ks=0.05  |  alpha-hat=10.13  |  num traps=7









Layer 34
   Layer=34  |  N=2304  |  M=768  |  Q=3.00  |  alpha=3.52  |  D_ks=0.04  |  alpha-hat=9.84  |  num traps=0









Layer 35
   Layer=35  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.58  |  D_ks=0.04  |  alpha-hat=8.19  |  num traps=0









Layer 40
   Layer=40  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.18  |  D_ks=0.03  |  alpha-hat=10.09  |  num traps=2









Layer 41
   Layer=41  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.32  |  D_ks=0.02  |  alpha-hat=10.51  |  num traps=6









Layer 47
   Layer=47  |  N=2304  |  M=768  |  Q=3.00  |  alpha=3.22  |  D_ks=0.03  |  alpha-hat=8.59  |  num traps=0









Layer 48
   Layer=48  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.69  |  D_ks=0.04  |  alpha-hat=10.44  |  num traps=0









Layer 53
   Layer=53  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.64  |  D_ks=0.02  |  alpha-hat=11.01  |  num traps=1









Layer 54
   Layer=54  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.52  |  D_ks=0.04  |  alpha-hat=11.33  |  num traps=4









Layer 60
   Layer=60  |  N=2304  |  M=768  |  Q=3.00  |  alpha=2.98  |  D_ks=0.05  |  alpha-hat=7.89  |  num traps=0









Layer 61
   Layer=61  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.94  |  D_ks=0.11  |  alpha-hat=6.83  |  num traps=0









Layer 66
   Layer=66  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.45  |  D_ks=0.03  |  alpha-hat=10.36  |  num traps=0









Layer 67
   Layer=67  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.60  |  D_ks=0.03  |  alpha-hat=10.81  |  num traps=2









Layer 73
   Layer=73  |  N=2304  |  M=768  |  Q=3.00  |  alpha=2.99  |  D_ks=0.09  |  alpha-hat=7.29  |  num traps=0









Layer 74
   Layer=74  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.65  |  D_ks=0.04  |  alpha-hat=8.10  |  num traps=0









Layer 79
   Layer=79  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.90  |  D_ks=0.03  |  alpha-hat=11.79  |  num traps=1









Layer 80
   Layer=80  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.59  |  D_ks=0.04  |  alpha-hat=10.36  |  num traps=0









Layer 86
   Layer=86  |  N=2304  |  M=768  |  Q=3.00  |  alpha=3.14  |  D_ks=0.08  |  alpha-hat=7.48  |  num traps=0









Layer 87
   Layer=87  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.27  |  D_ks=0.11  |  alpha-hat=7.68  |  num traps=0









Layer 92
   Layer=92  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.50  |  D_ks=0.04  |  alpha-hat=10.49  |  num traps=0









Layer 93
   Layer=93  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.87  |  D_ks=0.03  |  alpha-hat=11.41  |  num traps=0









Layer 99
   Layer=99  |  N=2304  |  M=768  |  Q=3.00  |  alpha=3.65  |  D_ks=0.08  |  alpha-hat=8.81  |  num traps=0









Layer 100
   Layer=100  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.09  |  D_ks=0.08  |  alpha-hat=7.63  |  num traps=0









Layer 105
   Layer=105  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.74  |  D_ks=0.04  |  alpha-hat=11.24  |  num traps=0









Layer 106
   Layer=106  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.05  |  D_ks=0.02  |  alpha-hat=12.01  |  num traps=0









Layer 112
   Layer=112  |  N=2304  |  M=768  |  Q=3.00  |  alpha=4.64  |  D_ks=0.07  |  alpha-hat=10.97  |  num traps=0









Layer 113
   Layer=113  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.89  |  D_ks=0.13  |  alpha-hat=7.53  |  num traps=0









Layer 118
   Layer=118  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.01  |  D_ks=0.03  |  alpha-hat=12.00  |  num traps=0









Layer 119
   Layer=119  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.18  |  D_ks=0.02  |  alpha-hat=12.46  |  num traps=0









Layer 125
   Layer=125  |  N=2304  |  M=768  |  Q=3.00  |  alpha=4.60  |  D_ks=0.06  |  alpha-hat=10.80  |  num traps=0









Layer 126
   Layer=126  |  N=768  |  M=768  |  Q=1.00  |  alpha=3.87  |  D_ks=0.13  |  alpha-hat=10.45  |  num traps=0









Layer 131
   Layer=131  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.67  |  D_ks=0.05  |  alpha-hat=11.02  |  num traps=0









Layer 132
   Layer=132  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.56  |  D_ks=0.03  |  alpha-hat=13.83  |  num traps=0









Layer 138
   Layer=138  |  N=2304  |  M=768  |  Q=3.00  |  alpha=6.53  |  D_ks=0.05  |  alpha-hat=15.35  |  num traps=0









Layer 139
   Layer=139  |  N=768  |  M=768  |  Q=1.00  |  alpha=4.96  |  D_ks=0.09  |  alpha-hat=15.02  |  num traps=1









Layer 144
   Layer=144  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.59  |  D_ks=0.03  |  alpha-hat=10.84  |  num traps=0









Layer 145
   Layer=145  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.53  |  D_ks=0.02  |  alpha-hat=14.66  |  num traps=2









Layer 151
   Layer=151  |  N=2304  |  M=768  |  Q=3.00  |  alpha=2.48  |  D_ks=0.12  |  alpha-hat=5.83  |  num traps=0









Layer 152
   Layer=152  |  N=768  |  M=768  |  Q=1.00  |  alpha=2.99  |  D_ks=0.11  |  alpha-hat=10.38  |  num traps=17









Layer 157
   Layer=157  |  N=3072  |  M=768  |  Q=4.00  |  alpha=3.32  |  D_ks=0.03  |  alpha-hat=10.44  |  num traps=0









Layer 158
   Layer=158  |  N=3072  |  M=768  |  Q=4.00  |  alpha=4.22  |  D_ks=0.04  |  alpha-hat=15.08  |  num traps=1