
Target: train an MLC (multi-label classification) model that, given an embedding produced by CodeBERT, predicts the token set of the original function.

Conducted in September 2022
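For context, the input embeddings come from CodeBERT. The preprocessing script is not part of these notes; the sketch below only illustrates, using the Hugging Face transformers API and the microsoft/codebert-base checkpoint, how a [1, x, 768] embedding of a source function could be produced.

import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')
codebert = AutoModel.from_pretrained('microsoft/codebert-base')

def embed_function(source_code):
    # tokenize the function body and run it through CodeBERT;
    # this is a sketch, not the preprocessing used in these notes
    inputs = tokenizer(source_code, return_tensors='pt',
                       truncation=True, max_length=512)
    with torch.no_grad():
        out = codebert(**inputs)
    # per-token hidden states, shape [1, x, 768]
    return out.last_hidden_state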

train_eval.py

  • Training parameters
drop_prob1, drop_prob2 = 0.2, 0.2  # dropout probabilities
learning_rate = 0.01
num_epochs = 20
batch_size = 128
vocab_size = 50264  # size of the tokenizer vocabulary
num_inputs, num_hidden1, num_hidden2, num_hidden3, num_outputs = 515*768, 1024, 896, 896, vocab_size
conf_prob = 0.6  # confidence threshold for predicting a token as present
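These settings imply a very wide first layer: flattening a zero-padded [515, 768] embedding yields 515*768 = 395,520 input features, so the first fully connected layer alone carries roughly 4×10^8 parameters. A quick sanity check:

# sanity check of the scale implied by the hyperparameters above
num_inputs = 515 * 768                   # 395,520 flattened features
first_layer = num_inputs * 1024 + 1024   # weights + biases of the first Linear
print(first_layer)                       # 405,013,504 parameters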
  • Model architecture
import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class FlattenLayer(nn.Module):
    def __init__(self):
        super(FlattenLayer, self).__init__()
    def forward(self, x):
        # collapse all dimensions except the batch dimension
        return x.view(x.shape[0], -1)

net = nn.Sequential(
    FlattenLayer(),
    nn.Linear(num_inputs, num_hidden1),
    nn.ReLU(),
    nn.Dropout(drop_prob1),
    nn.Linear(num_hidden1, num_hidden2),
    nn.ReLU(),
    nn.Dropout(drop_prob2),
    nn.Linear(num_hidden2, num_hidden3),
    nn.ReLU(),
    nn.Linear(num_hidden3, num_outputs),
).to(device)
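As an aside, FlattenLayer reproduces what the built-in nn.Flatten() does (flatten everything except the batch dimension), so the two are interchangeable here:

# nn.Flatten() defaults to start_dim=1, matching x.view(x.shape[0], -1)
x = torch.randn(4, 515, 768)
assert torch.equal(nn.Flatten()(x), FlattenLayer()(x))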
  • Dataset iterator
from tqdm import tqdm

def data_iter(data_split='validation'):
    batch_tot = 695 if data_split == 'validation' else 617
    for i in tqdm(range(batch_tot)):
        path = f'data/{data_split}/precessed_{i}.pth'
        raw_data = torch.load(path)

        for j in range(len(raw_data['embs'])):
            # each emb has shape [1, x, 768], where x is the number of tokens
            # in the source-function snippet; x varies across samples, so each
            # embedding is zero-padded to length 515 via torch.cat()
            raw_emb = raw_data['embs'][j]
            dim_2_len = raw_emb.size()[1]
            tmp = torch.zeros((1, 515 - dim_2_len, 768))
            raw_data['embs'][j] = torch.cat((raw_emb, tmp), dim=1)

        stacked_emb = torch.cat(raw_data['embs'], dim=0)
        ori_token = raw_data['tokens']
        out_vec = []

        # encode each token set as a multi-hot vector over the vocabulary
        for t in range(len(ori_token)):
            tmp = torch.zeros((1, 50264), dtype=torch.long)
            for it in iter(ori_token[t]):
                tmp[0][it] = 1
            out_vec.append(tmp)

        stacked_token = torch.cat(out_vec, dim=0).to(device)
        # yield suspends the generator here, so all batch files are
        # eventually consumed one by one
        yield stacked_emb, stacked_token.float()
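The inner token loop above builds each multi-hot target one index at a time; the same encoding can be written as a single indexed assignment. A sketch (multi_hot is a hypothetical helper, not part of the original script):

def multi_hot(token_ids, vocab_size=50264):
    # vectorized equivalent of the per-token loop above
    vec = torch.zeros((1, vocab_size))
    vec[0, list(token_ids)] = 1.0
    return vec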
  • Model training & evaluation
# The original notes do not show the loss or optimizer definitions; the two
# lines below are assumptions. Per-token BCE is the usual choice for
# multi-label classification, paired here with plain SGD.
loss = nn.BCEWithLogitsLoss(reduction='none')  # assumed, not in the original
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)  # assumed

def train(net, train_iter, num_epochs):
    net.train()
    for epoch in range(num_epochs):
        train_l_sum, n = 0.0, 0

        for X, y in train_iter('validation'):
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            # zero the gradients of the weights and biases
            optimizer.zero_grad()
            # backpropagate to compute gradients of the loss
            l.backward()
            # gradient descent step
            optimizer.step()
            # accumulate the loss
            train_l_sum += l.item()
            n += y.shape[0]
        print('epoch %d, loss %.4f' % (epoch + 1, train_l_sum / n))

def eval(net, test_iter):
    net.eval()
    # ground-truth positives, predicted positives, true positives
    p_true, p_hat, p_TP = 0, 0, 0

    for X, y in test_iter('test'):
        X = X.to(device)
        y = y.to(device)
        y_hat = net(X)
        idy, idy_h = [], []
        # note: only the first sample of each batch is scored, and the raw
        # network output is thresholded directly against conf_prob
        for i in range(vocab_size):
            if y[0][i] == 1:
                idy.append(i)
            if y_hat[0][i] > conf_prob:
                idy_h.append(i)
        p_true += len(idy)
        p_hat += len(idy_h)
        for t in idy_h:
            if t in idy:
                p_TP += 1

    Recall = p_TP / p_true
    Precision = p_TP / p_hat
    f1 = 2 * (Recall * Precision) / (Recall + Precision)
    print('Recall %.4f, Precision %.4f, F1 %.4f' % (Recall, Precision, f1))

train(net, data_iter, num_epochs)
eval(net, data_iter)
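For reference, the metrics above only score the first sample of each batch. A batched variant that scores every sample with the same micro-averaged precision/recall/F1 could look like this (micro_prf is a hypothetical helper, not part of the original script):

def micro_prf(y_hat, y, threshold=0.6):
    # micro-averaged precision / recall / F1 over the whole batch
    pred = (y_hat > threshold).float()
    tp = (pred * y).sum().item()   # correctly predicted positives
    p_hat = pred.sum().item()      # predicted positives
    p_true = y.sum().item()        # ground-truth positives
    precision = tp / max(p_hat, 1.0)
    recall = tp / max(p_true, 1.0)
    f1 = 2 * precision * recall / max(precision + recall, 1e-8)
    return recall, precision, f1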