Accuracy of different methods

| Method | Epochs | Accuracy (embeddings learned from training data) | Accuracy (glove.6B.100d) |
| --- | --- | --- | --- |
| lstm | 5 | 85.58% | 77.55% |
| lstm | 10 | 88.93% | 88.76% |
| lstm | 15 | 89.08% | 88.52% |
| lstm | 20 | 88.81% | 88.14% |
| conv | 5 | 86.68% | 86.26% |
| conv | 10 | 85.62% | 86.10% |
| conv | 15 | 85.82% | 85.51% |
| conv | 20 | 86.44% | 85.82% |
| transformer | 5 | 86.32% | - |
| transformer | 10 | 84.52% | - |
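
The two accuracy columns differ only in how the embedding layer is initialized: the first column learns embeddings from scratch over a vocabulary built from the training split, the second starts from the pretrained glove.6B.100d vectors. A minimal sketch of that difference (identifiers such as vocab_len and padding_idx follow the scripts below; this is an illustration, not a drop-in snippet):

import torch.nn as nn
from torchtext.vocab import GloVe

# Left column: randomly initialized embeddings over the training-set vocabulary.
embedding_scratch = nn.Embedding(num_embeddings=vocab_len, embedding_dim=100,
                                 padding_idx=padding_idx)

# Right column: embeddings initialized from pretrained glove.6B.100d vectors and fine-tuned.
glove_vectors = GloVe(name="6B", dim=100)
embedding_glove = nn.Embedding.from_pretrained(glove_vectors.vectors, freeze=False)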
lstm.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
num_epochs = 15
batch_size = 50
learning_rate = 0.001
max_sequence = 400
embedding_dim = 100
hidden_dim = 256
num_layers = 2
output_dim = 2
train_pipe, test_pipe = datasets.IMDB()
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator([tokenizer(text) for label, text in train_pipe], min_freq=2,
specials=["<unk>", "<pad>"], special_first=True)
vocab.set_default_index(vocab["<unk>"])
vocab_len = len(vocab)
padding_idx = vocab['<pad>']
class IMDBDataset(Dataset):
def __init__(self, pipe):
self.texts = []
self.labels = []
for label, text in pipe:
self.texts.append(text)
self.labels.append(label)
def __len__(self):
return len(self.texts)
def __getitem__(self, index):
text = self.texts[index]
label = self.labels[index] - 1
tokens = [vocab[token] for token in tokenizer(text)]
return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)
train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)
def collate_fn(batch):
texts, labels = zip(*batch)
texts = pad_sequence(texts, batch_first=True, padding_value=padding_idx)
return texts[:, :max_sequence], torch.tensor(labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
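# Two-layer bidirectional LSTM over 100-d embeddings learned from scratch.
# The last layer's forward and backward hidden states are concatenated and
# passed through dropout to a linear layer producing the two class logits.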
class RNNModel(nn.Module):
def __init__(self):
super().__init__()
self.embedding = nn.Embedding(num_embeddings=vocab_len, embedding_dim=embedding_dim, padding_idx=padding_idx)
self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True,
dropout=0.5, bidirectional=True)
self.dropout = nn.Dropout()
self.fc = nn.Linear(hidden_dim * 2, output_dim)
def forward(self, x):
out = self.embedding(x)
out = self.dropout(out)
_, (hidden, _) = self.rnn(out)
out = torch.concat((hidden[-1], hidden[-2]), dim=1)
out = self.dropout(out)
return self.fc(out)
model = RNNModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
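# Train with cross-entropy and Adam, then evaluate once on the held-out test split.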
for epoch in range(num_epochs):
for i, (texts, labels) in enumerate(train_loader):
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
loss = criterion(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0:
print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(epoch + 1, num_epochs, i + 1, len(train_loader),
loss.item()))
model.eval()
with torch.no_grad():
total = 0
correct = 0
for texts, labels in test_loader:
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
_, prediction = torch.max(output, dim=1)
total += len(texts)
correct += (prediction == labels).sum().item()
print("Accuracy of {} test reviews is {}%".format(len(train_dataset), correct / total * 100))
glove.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
num_epochs = 10
batch_size = 50
learning_rate = 0.001
max_sequence = 400
embedding_dim = 100
hidden_dim = 256
num_layers = 2
output_dim = 2
train_pipe, test_pipe = datasets.IMDB()
tokenizer = get_tokenizer('basic_english')
vocab = GloVe(name="6B", dim=100)
vocab_len = len(vocab)
padding_idx = 0
class IMDBDataset(Dataset):
def __init__(self, pipe):
self.texts = []
self.labels = []
for label, text in pipe:
self.texts.append(text)
self.labels.append(label)
def __len__(self):
return len(self.texts)
def __getitem__(self, index):
text = self.texts[index]
label = self.labels[index] - 1
tokens = [vocab.stoi[token] if token in vocab.stoi else 0 for token in tokenizer(text)]
return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)
train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)
def collate_fn(batch):
texts, labels = zip(*batch)
texts = pad_sequence(texts, batch_first=True, padding_value=padding_idx)
return texts[:, :max_sequence], torch.tensor(labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
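# Same bidirectional LSTM classifier as lstm.py, but the embedding layer is
# initialized from the pretrained glove.6B.100d vectors instead of being learned from scratch.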
class RNNModel(nn.Module):
def __init__(self):
super().__init__()
        # Initialize from the pretrained glove.6B.100d vectors and fine-tune them.
        # Note: index 0 (an ordinary GloVe token) doubles as the unknown/padding index here.
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors, freeze=False)
self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True,
dropout=0.5, bidirectional=True)
self.dropout = nn.Dropout()
self.fc = nn.Linear(hidden_dim * 2, output_dim)
def forward(self, x):
out = self.embedding(x)
out = self.dropout(out)
_, (hidden, _) = self.rnn(out)
out = torch.concat((hidden[-1], hidden[-2]), dim=1)
out = self.dropout(out)
return self.fc(out)
model = RNNModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for i, (texts, labels) in enumerate(train_loader):
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
loss = criterion(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0:
print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(epoch + 1, num_epochs, i + 1, len(train_loader),
loss.item()))
model.eval()
with torch.no_grad():
total = 0
correct = 0
for texts, labels in test_loader:
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
_, prediction = torch.max(output, dim=1)
total += len(texts)
correct += (prediction == labels).sum().item()
print("Accuracy of {} test reviews is {}%".format(len(train_dataset), correct / total * 100))
cnn.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
num_epochs = 5
batch_size = 50
learning_rate = 0.001
max_sequence = 400
embedding_dim = 100
hidden_dim = 256
num_layers = 2
output_dim = 2
train_pipe, test_pipe = datasets.IMDB()
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator([tokenizer(text) for label, text in train_pipe], min_freq=2,
specials=["<unk>", "<pad>"], special_first=True)
vocab.set_default_index(vocab["<unk>"])
vocab_len = len(vocab)
padding_idx = vocab['<pad>']
class IMDBDataset(Dataset):
def __init__(self, pipe):
self.texts = []
self.labels = []
for label, text in pipe:
self.texts.append(text)
self.labels.append(label)
def __len__(self):
return len(self.texts)
def __getitem__(self, index):
text = self.texts[index]
label = self.labels[index] - 1
tokens = [vocab[token] for token in tokenizer(text)]
return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)
train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)
def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=padding_idx)
    # Pad every batch out to exactly max_sequence so the flattened CNN feature size is fixed.
    if texts.size(1) < max_sequence:
        texts = nn.functional.pad(texts, (0, max_sequence - texts.size(1)), value=padding_idx)
    return texts[:, :max_sequence], torch.tensor(labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
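# 1-D CNN classifier: two Conv1d + ReLU + max-pool blocks (100 -> 128 -> 256 channels,
# each pool halving the sequence length), then dropout, flatten, and a linear layer
# over the 256 * (max_sequence // 4) flattened features.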
class CNNModel(nn.Module):
def __init__(self):
super().__init__()
self.embedding = nn.Embedding(vocab_len, embedding_dim, padding_idx=padding_idx)
self.cnn1 = nn.Sequential(
nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=3, stride=1, padding=1),
nn.ReLU(),
nn.MaxPool1d(kernel_size=2, stride=2)
)
self.cnn2 = nn.Sequential(
nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
nn.ReLU(),
nn.MaxPool1d(kernel_size=2, stride=2)
)
self.flatten = nn.Flatten()
self.dropout = nn.Dropout(0.5)
        # After two stride-2 poolings the sequence length is max_sequence // 4, with 256 channels.
        self.fc = nn.Linear(256 * (max_sequence // 4), output_dim)
def forward(self, x):
out = self.embedding(x)
out = out.permute(0, 2, 1)
out = self.cnn1(out)
out = self.cnn2(out)
out = self.dropout(out)
out = self.flatten(out)
return self.fc(out)
model = CNNModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for i, (texts, labels) in enumerate(train_loader):
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
loss = criterion(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0:
print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(epoch + 1, num_epochs, i + 1, len(train_loader),
loss.item()))
model.eval()
with torch.no_grad():
total = 0
correct = 0
for texts, labels in test_loader:
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
_, prediction = torch.max(output, dim=1)
total += len(texts)
correct += (prediction == labels).sum().item()
print("Accuracy of {} test reviews is {}%".format(len(train_dataset), correct / total * 100))
cnn_glove.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
num_epochs = 5
batch_size = 50
learning_rate = 0.001
max_sequence = 400
embedding_dim = 100
hidden_dim = 256
num_layers = 2
output_dim = 2
train_pipe, test_pipe = datasets.IMDB()
tokenizer = get_tokenizer('basic_english')
vocab = GloVe("6B", dim=300)
vocab_len = len(vocab)
class IMDBDataset(Dataset):
def __init__(self, pipe):
self.texts = []
self.labels = []
for label, text in pipe:
self.texts.append(text)
self.labels.append(label)
def __len__(self):
return len(self.texts)
def __getitem__(self, index):
text = self.texts[index]
label = self.labels[index] - 1
tokens = [vocab.stoi[token] if token in vocab.stoi else 0 for token in tokenizer(text)]
return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)
train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)
def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=0)
    # Pad every batch out to exactly max_sequence so the flattened CNN feature size is fixed.
    if texts.size(1) < max_sequence:
        texts = nn.functional.pad(texts, (0, max_sequence - texts.size(1)), value=0)
    return texts[:, :max_sequence], torch.tensor(labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
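# Same 1-D CNN classifier as cnn.py, but the embedding layer is initialized
# from the pretrained glove.6B.100d vectors.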
class CNNModel(nn.Module):
def __init__(self):
super().__init__()
        # Initialize from the pretrained glove.6B.100d vectors and fine-tune them.
        # Note: index 0 (an ordinary GloVe token) doubles as the unknown/padding index here.
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors, freeze=False)
self.cnn1 = nn.Sequential(
nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=3, stride=1, padding=1),
nn.ReLU(),
nn.MaxPool1d(kernel_size=2, stride=2)
)
self.cnn2 = nn.Sequential(
nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
nn.ReLU(),
nn.MaxPool1d(kernel_size=2, stride=2)
)
self.flatten = nn.Flatten()
self.dropout = nn.Dropout(0.5)
        # After two stride-2 poolings the sequence length is max_sequence // 4, with 256 channels.
        self.fc = nn.Linear(256 * (max_sequence // 4), output_dim)
def forward(self, x):
out = self.embedding(x)
out = out.permute(0, 2, 1)
out = self.cnn1(out)
out = self.cnn2(out)
out = self.dropout(out)
out = self.flatten(out)
return self.fc(out)
model = CNNModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for i, (texts, labels) in enumerate(train_loader):
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
loss = criterion(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0:
print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(epoch + 1, num_epochs, i + 1, len(train_loader),
loss.item()))
model.eval()
with torch.no_grad():
total = 0
correct = 0
for texts, labels in test_loader:
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
_, prediction = torch.max(output, dim=1)
total += len(texts)
correct += (prediction == labels).sum().item()
print("Accuracy of {} test reviews is {}%".format(len(train_dataset), correct / total * 100))
transformer.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
num_epochs = 5
batch_size = 50
learning_rate = 0.001
max_sequence = 400
embedding_dim = 96
hidden_dim = 256
num_layers = 2
output_dim = 2
nhead = 8
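# The model dimension (embedding_dim = 96) must be divisible by nhead (96 / 8 = 12 per attention head).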
train_pipe, test_pipe = datasets.IMDB()
tokenizer = get_tokenizer("basic_english")
vocab = build_vocab_from_iterator(
[tokenizer(text) for label, text in train_pipe],
min_freq=2,
specials=["<unk>", "<pad>"],
special_first=True,
)
vocab.set_default_index(vocab["<unk>"])
vocab_len = len(vocab)
padding_idx = vocab["<pad>"]
class IMDBDataset(Dataset):
def __init__(self, pipe):
self.texts = []
self.labels = []
for label, text in pipe:
self.texts.append(text)
self.labels.append(label)
def __len__(self):
return len(self.texts)
def __getitem__(self, index):
text = self.texts[index]
label = self.labels[index] - 1
tokens = [vocab[token] for token in tokenizer(text)]
return torch.tensor(tokens, dtype=torch.long), torch.tensor(
label, dtype=torch.long
)
train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)
def collate_fn(batch):
texts, labels = zip(*batch)
texts = pad_sequence(texts, batch_first=True, padding_value=padding_idx)
return texts[:, :max_sequence], torch.tensor(labels)
train_loader = DataLoader(
train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
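# Transformer classifier: learned positional encodings are added to the token
# embeddings, a full nn.Transformer (encoder and decoder) is applied with the same
# sequence as both source and target, the outputs are mean-pooled over positions,
# and a linear layer produces the two class logits.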
class TransformerModel(nn.Module):
def __init__(self):
super().__init__()
self.embedding = nn.Embedding(
num_embeddings=vocab_len,
embedding_dim=embedding_dim,
padding_idx=padding_idx,
)
self.positional_encoding = nn.Parameter(
torch.zeros(1, max_sequence, embedding_dim)
)
self.transformer = nn.Transformer(
d_model=embedding_dim,
nhead=nhead,
num_encoder_layers=num_layers,
num_decoder_layers=num_layers,
dim_feedforward=hidden_dim,
dropout=0.5,
batch_first=True,
)
self.fc = nn.Linear(embedding_dim, output_dim)
def forward(self, x):
seq_length = x.size(1)
x = self.embedding(x) + self.positional_encoding[:, :seq_length, :]
output = self.transformer(x, x)
output = output.mean(dim=1)
return self.fc(output)
model = TransformerModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for i, (texts, labels) in enumerate(train_loader):
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
loss = criterion(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0:
print(
"Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(
epoch + 1, num_epochs, i + 1, len(train_loader), loss.item()
)
)
model.eval()
with torch.no_grad():
total = 0
correct = 0
for texts, labels in test_loader:
texts = texts.to(device)
labels = labels.to(device)
output = model(texts)
_, prediction = torch.max(output, dim=1)
total += len(texts)
correct += (prediction == labels).sum().item()
print(
    "Accuracy of {} test reviews is {:.2f}%".format(
        len(test_dataset), correct / total * 100
    )
)