- This post is a learning-record blog for the 365-day deep learning training camp (365天深度学习训练营)
- Original author: K同学啊
This week's task:
Load the .txt file from week N1 and perform word embedding with both EmbeddingBag and Embedding
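Before walking through the full code, here is a minimal shape-level sketch contrasting the two modules this task compares (the vocabulary size 10 and embedding dimension 6 are arbitrary illustration values, not taken from the task file):

import torch
from torch import nn

tokens = torch.tensor([[1, 2, 3], [4, 5, 6]])   # a batch of 2 sequences of length 3
emb = nn.Embedding(10, 6)                       # lookup table: one vector per token
bag = nn.EmbeddingBag(10, 6, mode='mean')       # lookup + pooling fused into one op
print(emb(tokens).shape)                        # torch.Size([2, 3, 6])
print(bag(tokens).shape)                        # torch.Size([2, 6]), one pooled vector per sequence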
Embedding
Define a custom Dataset class
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class MyDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        texts = self.texts[idx]
        labels = self.labels[idx]
        return texts, labels
Define the padding (collate) function
# Custom padding/collate function
def collate_batch(batch):
    texts, labels = zip(*batch)
    max_len = max(len(text) for text in texts)
    padded_texts = [F.pad(text, (0, max_len - len(text)), value=0) for text in texts]
    padded_texts = torch.stack(padded_texts)
    labels = torch.tensor(labels, dtype=torch.float).unsqueeze(1)
    return padded_texts, labels
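Since the collate function pads every sequence with index 0, a related option worth knowing is nn.Embedding's padding_idx argument, which pins the vector at that index to zeros and excludes it from gradient updates. A minimal side sketch (the sizes are the same toy values used below; note that a plain mean over the sequence dimension would still divide by the padded length):

embedding = nn.Embedding(10, 6, padding_idx=0)            # row 0 stays all-zero and gets no gradient
padded = torch.tensor([[3, 3, 0, 0]], dtype=torch.long)   # a padded toy sequence
print(embedding(padded)[0, 2])                            # the padded position maps to a zero vector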
Prepare the data and the data loader
# Prepare the data and the data loader
text_data = [
    torch.tensor([1, 1, 1, 1], dtype=torch.long),
    torch.tensor([2, 2, 2], dtype=torch.long),
    torch.tensor([3, 3], dtype=torch.long)
]
labels = torch.tensor([4, 5, 6], dtype=torch.float)

my_dataset = MyDataset(text_data, labels)
data_loader = DataLoader(my_dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)

for batch in data_loader:
    print(batch)
Define the model
# Define the model
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text):
        print('embedding输入文本是:', text)
        print('embedding输入文本shape:', text.shape)
        embedding = self.embedding(text)
        embedding_mean = embedding.mean(dim=1)  # average the token vectors over the sequence dimension
        print('embedding输出文本shape:', embedding_mean.shape)
        return self.fc(embedding_mean)
Train the model
vocab_size = 10
embed_dim = 6

model = EmbeddingModel(vocab_size, embed_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        texts, labels = batch
        outputs = model(texts)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
EmbeddingBag
The dataset, collate function, and data preparation steps are the same as above. Note that nn.EmbeddingBag with offsets expects an un-padded, flattened 1-D input tensor, so the training loop below draws batches of raw (text, label) pairs rather than the padded output of collate_batch.
Define the model
# Define the model
class EmbeddingBagModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingBagModel, self).__init__()
        self.embedding_bag = nn.EmbeddingBag(vocab_size, embed_dim, mode='mean')
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text, offsets):
        print('embedding_bag输入文本是:', text)
        print('embedding_bag输入文本shape:', text.shape)
        embedded = self.embedding_bag(text, offsets)  # lookup and mean pooling in a single fused op
        print('embedding_bag输出文本shape:', embedded.shape)
        return self.fc(embedded)
Train the model
vocab_size = 10
embed_dim = 6

model = EmbeddingBagModel(vocab_size, embed_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# EmbeddingBag with offsets needs the un-padded sequences, so this loader returns
# the raw (text, label) pairs instead of the padded tensors produced by collate_batch
raw_loader = DataLoader(my_dataset, batch_size=2, shuffle=True, collate_fn=lambda batch: batch)

for epoch in range(1):
    for batch in raw_loader:
        texts, labels = zip(*batch)
        # offsets mark where each sequence starts inside the flattened 1-D tensor
        offsets = [0] + [len(text) for text in texts[:-1]]
        offsets = torch.tensor(offsets).cumsum(dim=0)
        texts = torch.cat(texts)
        labels = torch.stack(labels).unsqueeze(1)
        outputs = model(texts, offsets)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
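To make the offsets semantics concrete, the following small sketch (an illustration, not part of the original exercise) checks that EmbeddingBag in 'mean' mode matches manually averaging nn.Embedding lookups bag by bag:

emb_check = nn.Embedding(10, 6)
bag_check = nn.EmbeddingBag(10, 6, mode='mean')
bag_check.weight = emb_check.weight                       # share weights so the outputs are comparable

flat = torch.tensor([1, 1, 1, 2, 2], dtype=torch.long)    # two bags flattened into one tensor
offsets = torch.tensor([0, 3])                            # bag 0 = flat[0:3], bag 1 = flat[3:]

out_bag = bag_check(flat, offsets)                        # shape [2, 6]
out_manual = torch.stack([emb_check(flat[0:3]).mean(0),
                          emb_check(flat[3:]).mean(0)])
print(torch.allclose(out_bag, out_manual))                # True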
Embedding the task file (任务文件.txt)
Embedding
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import jieba
import numpy as np

# Custom padding/collate function
def collate_batch(batch):
    texts, labels = zip(*batch)
    max_len = max(len(text) for text in texts)
    padded_texts = [F.pad(text, (0, max_len - len(text)), value=0) for text in texts]
    padded_texts = torch.stack(padded_texts)
    labels = torch.tensor(labels, dtype=torch.float).unsqueeze(1)
    return padded_texts, labels

# Read the text content from the local txt file
with open("F:/365data/N1/任务文件.txt", 'r', encoding='utf-8') as file:
    texts1 = [line.strip() for line in file]

# Tokenize with jieba
tokenized_texts = [list(jieba.cut(text)) for text in texts1]

# Build the vocabulary
word_index = {}
index_word = {}
for i, word in enumerate(set([word for text in tokenized_texts for word in text])):
    word_index[word] = i
    index_word[i] = word

# Vocabulary size
vocab_size = len(word_index) + 1  # +1 intended to leave room for the padding index 0 (the enumeration above also starts at 0, so here one real word shares that index with padding)

# Convert the texts to sequences of vocabulary indices
texts = [[word_index[word] for word in text] for text in tokenized_texts]

# Manually assign labels
# Assume the first line has label 1.0 and the second line has label 2.0
labels = [1.0, 2.0]

# Define the custom dataset class
class MyDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# Create the dataset
my_dataset = MyDataset(texts, labels)

# Create the data loader
data_loader = DataLoader(my_dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)

# Print the batches from the data loader
for batch in data_loader:
    texts, labels = batch
    print("texts:", texts)
    print("Labels:", labels)
# Define the model
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text):
        print('embedding输入文本是:', text)
        print('embedding输入文本shape:', text.shape)
        embedding = self.embedding(text)
        embedding_mean = embedding.mean(dim=1)  # average over the sequence dimension
        print('embedding输出文本shape:', embedding_mean.shape)
        return self.fc(embedding_mean)

embed_dim = 6

model = EmbeddingModel(vocab_size, embed_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        texts, labels = batch
        outputs = model(texts)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache
Loading model cost 0.566 seconds.
Prefix dict has been built successfully.
texts: tensor([[22, 45, 69, 23, 24, 70, 73, 34, 25, 75, 21, 52, 78, 62, 64, 21, 10, 56,
34, 25, 75, 21, 4, 42, 47, 27, 35, 32, 54, 16, 36, 7, 83, 24, 74, 80,
7, 81, 4, 15, 51, 17, 24, 67, 81, 4, 15, 51, 46, 56, 79, 24, 32, 54,
44, 8, 82, 66, 4, 24, 12, 49, 31, 71, 6, 59, 56, 65, 24, 38, 41, 54,
4, 20, 24, 58, 40, 60, 4, 34, 25, 75, 21, 68],
[85, 86, 4, 11, 27, 84, 57, 33, 4, 14, 1, 56, 65, 24, 38, 7, 5, 41,
54, 4, 23, 24, 58, 3, 17, 2, 77, 63, 72, 19, 55, 37, 41, 54, 56, 18,
24, 69, 11, 49, 7, 23, 24, 8, 28, 30, 76, 48, 50, 61, 39, 54, 44, 49,
0, 31, 71, 6, 59, 24, 9, 13, 29, 59, 30, 27, 12, 49, 4, 43, 12, 53,
26, 4, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Labels: tensor([[2.],
[1.]])
embedding输入文本是: tensor([[22, 45, 69, 23, 24, 70, 73, 34, 25, 75, 21, 52, 78, 62, 64, 21, 10, 56,
34, 25, 75, 21, 4, 42, 47, 27, 35, 32, 54, 16, 36, 7, 83, 24, 74, 80,
7, 81, 4, 15, 51, 17, 24, 67, 81, 4, 15, 51, 46, 56, 79, 24, 32, 54,
44, 8, 82, 66, 4, 24, 12, 49, 31, 71, 6, 59, 56, 65, 24, 38, 41, 54,
4, 20, 24, 58, 40, 60, 4, 34, 25, 75, 21, 68],
[85, 86, 4, 11, 27, 84, 57, 33, 4, 14, 1, 56, 65, 24, 38, 7, 5, 41,
54, 4, 23, 24, 58, 3, 17, 2, 77, 63, 72, 19, 55, 37, 41, 54, 56, 18,
24, 69, 11, 49, 7, 23, 24, 8, 28, 30, 76, 48, 50, 61, 39, 54, 44, 49,
0, 31, 71, 6, 59, 24, 9, 13, 29, 59, 30, 27, 12, 49, 4, 43, 12, 53,
26, 4, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
embedding输入文本shape: torch.Size([2, 84])
embedding输出文本shape: torch.Size([2, 6])
Epoch 1, Loss: 0.5101226568222046
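After training, the learned lookup table can be inspected directly. A minimal sketch (assuming the word_index dictionary and the trained model above are still in scope) that prints the 6-dimensional vector learned for one token:

word = next(iter(word_index))                        # pick any word from the vocabulary
idx = torch.tensor([word_index[word]], dtype=torch.long)
with torch.no_grad():
    vec = model.embedding(idx)                       # shape [1, embed_dim]
print(word, vec)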
EmbeddingBag
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import jieba
import numpy as np

# Custom padding/collate function
def collate_batch(batch):
    texts, labels = zip(*batch)
    max_len = max(len(text) for text in texts)
    padded_texts = [F.pad(text, (0, max_len - len(text)), value=0) for text in texts]
    padded_texts = torch.stack(padded_texts)
    labels = torch.tensor(labels, dtype=torch.float).unsqueeze(1)
    return padded_texts, labels

# Read the text content from the local txt file
with open("F:/365data/N2/任务文件.txt", 'r', encoding='utf-8') as file:
    texts1 = [line.strip() for line in file]

# Tokenize with jieba
tokenized_texts = [list(jieba.cut(text)) for text in texts1]

# Build the vocabulary
word_index = {}
index_word = {}
for i, word in enumerate(set([word for text in tokenized_texts for word in text])):
    word_index[word] = i
    index_word[i] = word

# Vocabulary size
vocab_size = len(word_index) + 1  # +1 intended to leave room for the padding index 0

# Convert the texts to sequences of vocabulary indices
texts = [[word_index[word] for word in text] for text in tokenized_texts]

# Manually assign labels
# Assume the first line has label 1.0 and the second line has label 2.0
labels = [1.0, 2.0]

# Define the custom dataset class
class MyDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# Create the dataset
my_dataset = MyDataset(texts, labels)

# Create the data loader
data_loader = DataLoader(my_dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)

# Print the batches from the data loader
for batch in data_loader:
    texts, labels = batch
    print("texts:", texts)
    print("Labels:", labels)

# Define the model
class EmbeddingBagModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingBagModel, self).__init__()
        self.embedding_bag = nn.EmbeddingBag(vocab_size, embed_dim, mode='mean')
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text):
        print('embedding bag 输入文本是:', text)
        print('embedding bag 输入文本shape:', text.shape)
        # With 2-D input and no offsets, EmbeddingBag treats each row as one bag
        # (the padding zeros are included in the mean)
        embedding = self.embedding_bag(text)
        print('embedding bag 输出文本shape:', embedding.shape)
        return self.fc(embedding)

embed_dim = 6

model = EmbeddingBagModel(vocab_size, embed_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        texts, labels = batch
        outputs = model(texts)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache
Loading model cost 0.490 seconds.
Prefix dict has been built successfully.
texts: tensor([[64, 17, 6, 48, 78, 52, 56, 8, 4, 49, 35, 19, 43, 41, 30, 35, 59, 34,
8, 4, 49, 35, 67, 38, 26, 50, 7, 21, 74, 70, 24, 84, 20, 78, 72, 37,
84, 82, 67, 73, 54, 22, 78, 57, 82, 67, 73, 54, 2, 34, 3, 78, 21, 74,
44, 66, 16, 11, 67, 78, 23, 63, 36, 85, 12, 0, 34, 79, 78, 13, 76, 74,
67, 42, 78, 58, 75, 86, 67, 8, 4, 49, 35, 80],
[61, 28, 67, 83, 50, 15, 25, 60, 67, 77, 18, 34, 79, 78, 13, 84, 81, 76,
74, 67, 48, 78, 58, 14, 22, 69, 5, 10, 32, 45, 46, 51, 76, 74, 34, 68,
78, 6, 83, 63, 84, 48, 78, 66, 9, 53, 31, 39, 40, 62, 33, 74, 44, 63,
47, 36, 85, 12, 0, 78, 1, 65, 55, 0, 53, 50, 23, 63, 67, 27, 23, 71,
29, 67, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Labels: tensor([[2.],
[1.]])
embedding bag 输入文本是: tensor([[64, 17, 6, 48, 78, 52, 56, 8, 4, 49, 35, 19, 43, 41, 30, 35, 59, 34,
8, 4, 49, 35, 67, 38, 26, 50, 7, 21, 74, 70, 24, 84, 20, 78, 72, 37,
84, 82, 67, 73, 54, 22, 78, 57, 82, 67, 73, 54, 2, 34, 3, 78, 21, 74,
44, 66, 16, 11, 67, 78, 23, 63, 36, 85, 12, 0, 34, 79, 78, 13, 76, 74,
67, 42, 78, 58, 75, 86, 67, 8, 4, 49, 35, 80],
[61, 28, 67, 83, 50, 15, 25, 60, 67, 77, 18, 34, 79, 78, 13, 84, 81, 76,
74, 67, 48, 78, 58, 14, 22, 69, 5, 10, 32, 45, 46, 51, 76, 74, 34, 68,
78, 6, 83, 63, 84, 48, 78, 66, 9, 53, 31, 39, 40, 62, 33, 74, 44, 63,
47, 36, 85, 12, 0, 78, 1, 65, 55, 0, 53, 50, 23, 63, 67, 27, 23, 71,
29, 67, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
embedding bag 输入文本shape: torch.Size([2, 84])
embedding bag 输出文本shape: torch.Size([2, 6])
Epoch 1, Loss: 0.6875726580619812
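Because the padded 2-D call above averages the padding zeros into each bag, an alternative worth sketching is the flattened 1-D input with offsets, which pools only the real tokens. This snippet is an illustration (assuming tokenized_texts, word_index, and the trained model above are in scope), not part of the original exercise:

# Rebuild the un-padded index sequences from the tokenized text
seqs = [torch.tensor([word_index[w] for w in toks], dtype=torch.long)
        for toks in tokenized_texts]
offsets = torch.tensor([0] + [len(s) for s in seqs[:-1]]).cumsum(dim=0)
flat = torch.cat(seqs)                                   # all tokens in one 1-D tensor

with torch.no_grad():
    bags = model.embedding_bag(flat, offsets)            # shape [num_lines, embed_dim]
print(bags.shape)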
Summary
- Both Embedding and EmbeddingBag map discrete tokens into a low-dimensional continuous vector space in which, after training, semantic relationships between words can be reflected
- EmbeddingBag fuses the lookup and pooling steps of Embedding, avoiding the intermediate per-token tensor and thus reducing computation and memory overhead
- To embed the .txt file, the text must first be loaded, tokenized with jieba (as in weeks N1/N2), and converted into sequences of vocabulary indices
- The sequences are then wrapped in a Dataset, loaded through a DataLoader, and used to train the model
- Note that after loading the text, the labels have to be set manually