Movie Review Sentiment Classification with PyTorch

Created: 2025-02-09
Updated: 2025-03-12

Accuracy of the different approaches

Method      | Epochs | Accuracy (vocab built from training data) | Accuracy (glove.6B.100d)
lstm        | 5      | 85.58%                                    | 77.55%
lstm        | 10     | 88.93%                                    | 88.76%
lstm        | 15     | 89.08%                                    | 88.52%
lstm        | 20     | 88.81%                                    | 88.14%
conv        | 5      | 86.68%                                    | 86.26%
conv        | 10     | 85.62%                                    | 86.10%
conv        | 15     | 85.82%                                    | 85.51%
conv        | 20     | 86.44%                                    | 85.82%
transformer | 5      | 86.32%                                    | -
transformer | 10     | 84.52%                                    | -

lstm.py

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

num_epochs = 15
batch_size = 50
learning_rate = 0.001

max_sequence = 400
embedding_dim = 100
hidden_dim = 256
num_layers = 2
output_dim = 2

train_pipe, test_pipe = datasets.IMDB()

tokenizer = get_tokenizer('basic_english')

vocab = build_vocab_from_iterator([tokenizer(text) for label, text in train_pipe], min_freq=2,
                                  specials=["<unk>", "<pad>"], special_first=True)
vocab.set_default_index(vocab["<unk>"])
vocab_len = len(vocab)
padding_idx = vocab['<pad>']

class IMDBDataset(Dataset):
    def __init__(self, pipe):
        self.texts = []
        self.labels = []
        for label, text in pipe:
            self.texts.append(text)
            self.labels.append(label)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index] - 1
        tokens = [vocab[token] for token in tokenizer(text)]
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)

train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)

def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=padding_idx)
    return texts[:, :max_sequence], torch.tensor(labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class RNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_len, embedding_dim=embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True,
                           dropout=0.5, bidirectional=True)
        self.dropout = nn.Dropout()
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        out = self.embedding(x)
        out = self.dropout(out)
        _, (hidden, _) = self.rnn(out)
        # hidden: (num_layers * 2, batch, hidden_dim); hidden[-2] and hidden[-1] are the
        # final forward and backward states of the top layer
        out = torch.concat((hidden[-1], hidden[-2]), dim=1)
        out = self.dropout(out)
        return self.fc(out)

model = RNNModel().to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for i, (texts, labels) in enumerate(train_loader):
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(epoch + 1, num_epochs, i + 1, len(train_loader),
                                                                     loss.item()))

model.eval()

with torch.no_grad():
    total = 0
    correct = 0
    for texts, labels in test_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        _, prediction = torch.max(output, dim=1)

        total += len(texts)
        correct += (prediction == labels).sum().item()

    print("Accuracy of {} test reviews is {}%".format(len(train_dataset), correct / total * 100))

glove.py

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe

num_epochs = 10
batch_size = 50
learning_rate = 0.001

max_sequence = 400
embedding_dim = 100
hidden_dim = 256
num_layers = 2
output_dim = 2

train_pipe, test_pipe = datasets.IMDB()

tokenizer = get_tokenizer('basic_english')

vocab = GloVe(name="6B", dim=100)
vocab_len = len(vocab)
padding_idx = 0

class IMDBDataset(Dataset):
    def __init__(self, pipe):
        self.texts = []
        self.labels = []
        for label, text in pipe:
            self.texts.append(text)
            self.labels.append(label)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index] - 1
        tokens = [vocab.stoi[token] if token in vocab.stoi else 0 for token in tokenizer(text)]
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)

train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)

def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=padding_idx)
    return texts[:, :max_sequence], torch.tensor(labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class RNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Initialize the embedding from the pretrained glove.6B.100d vectors and fine-tune them
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors, freeze=False, padding_idx=padding_idx)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True,
                           dropout=0.5, bidirectional=True)
        self.dropout = nn.Dropout()
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        out = self.embedding(x)
        out = self.dropout(out)
        _, (hidden, _) = self.rnn(out)
        out = torch.concat((hidden[-1], hidden[-2]), dim=1)
        out = self.dropout(out)
        return self.fc(out)

model = RNNModel().to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for i, (texts, labels) in enumerate(train_loader):
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(epoch + 1, num_epochs, i + 1, len(train_loader),
                                                                     loss.item()))

model.eval()

with torch.no_grad():
    total = 0
    correct = 0
    for texts, labels in test_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        _, prediction = torch.max(output, dim=1)

        total += len(texts)
        correct += (prediction == labels).sum().item()

    print("Accuracy of {} test reviews is {}%".format(len(train_dataset), correct / total * 100))

cnn.py

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

num_epochs = 5
batch_size = 50
learning_rate = 0.001

max_sequence = 400
embedding_dim = 100
hidden_dim = 256
num_layers = 2
output_dim = 2

train_pipe, test_pipe = datasets.IMDB()

tokenizer = get_tokenizer('basic_english')

vocab = build_vocab_from_iterator([tokenizer(text) for label, text in train_pipe], min_freq=2,
                                  specials=["<unk>", "<pad>"], special_first=True)
vocab.set_default_index(vocab["<unk>"])
vocab_len = len(vocab)
padding_idx = vocab['<pad>']

class IMDBDataset(Dataset):
    def __init__(self, pipe):
        self.texts = []
        self.labels = []
        for label, text in pipe:
            self.texts.append(text)
            self.labels.append(label)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index] - 1
        tokens = [vocab[token] for token in tokenizer(text)]
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)

train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)

def collate_fn(batch):
    texts, labels = zip(*batch)
    # Pad every batch to exactly max_sequence tokens so the flattened CNN feature size is constant
    texts = pad_sequence(texts, batch_first=True, padding_value=padding_idx)[:, :max_sequence]
    if texts.size(1) < max_sequence:
        texts = nn.functional.pad(texts, (0, max_sequence - texts.size(1)), value=padding_idx)
    return texts, torch.tensor(labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(vocab_len, embedding_dim, padding_idx=padding_idx)

        self.cnn1 = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )
        self.cnn2 = nn.Sequential(
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.5)
        # Two stride-2 max-pools reduce the length from max_sequence to max_sequence // 4, with 256 channels
        self.fc = nn.Linear(256 * (max_sequence // 4), output_dim)

    def forward(self, x):
        out = self.embedding(x)
        out = out.permute(0, 2, 1)
        out = self.cnn1(out)
        out = self.cnn2(out)
        out = self.dropout(out)
        out = self.flatten(out)
        return self.fc(out)

model = CNNModel().to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for i, (texts, labels) in enumerate(train_loader):
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(epoch + 1, num_epochs, i + 1, len(train_loader),
                                                                     loss.item()))

model.eval()

with torch.no_grad():
    total = 0
    correct = 0
    for texts, labels in test_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        _, prediction = torch.max(output, dim=1)

        total += len(texts)
        correct += (prediction == labels).sum().item()

    print("Accuracy of {} test reviews is {}%".format(len(train_dataset), correct / total * 100))

cnn_glove.py

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe

num_epochs = 5
batch_size = 50
learning_rate = 0.001

max_sequence = 400
embedding_dim = 100
hidden_dim = 256
num_layers = 2
output_dim = 2

train_pipe, test_pipe = datasets.IMDB()

tokenizer = get_tokenizer('basic_english')

vocab = GloVe(name="6B", dim=100)
vocab_len = len(vocab)

class IMDBDataset(Dataset):
    def __init__(self, pipe):
        self.texts = []
        self.labels = []
        for label, text in pipe:
            self.texts.append(text)
            self.labels.append(label)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index] - 1
        tokens = [vocab.stoi[token] if token in vocab.stoi else 0 for token in tokenizer(text)]
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)

train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)

def collate_fn(batch):
    texts, labels = zip(*batch)
    # Pad every batch to exactly max_sequence tokens so the flattened CNN feature size is constant
    texts = pad_sequence(texts, batch_first=True, padding_value=0)[:, :max_sequence]
    if texts.size(1) < max_sequence:
        texts = nn.functional.pad(texts, (0, max_sequence - texts.size(1)), value=0)
    return texts, torch.tensor(labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Initialize the embedding from the pretrained glove.6B.100d vectors and fine-tune them
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors, freeze=False, padding_idx=0)

        self.cnn1 = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )
        self.cnn2 = nn.Sequential(
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.5)
        # Two stride-2 max-pools reduce the length from max_sequence to max_sequence // 4, with 256 channels
        self.fc = nn.Linear(256 * (max_sequence // 4), output_dim)

    def forward(self, x):
        out = self.embedding(x)
        out = out.permute(0, 2, 1)
        out = self.cnn1(out)
        out = self.cnn2(out)
        out = self.dropout(out)
        out = self.flatten(out)
        return self.fc(out)

model = CNNModel().to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for i, (texts, labels) in enumerate(train_loader):
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(epoch + 1, num_epochs, i + 1, len(train_loader),
                                                                     loss.item()))

model.eval()

with torch.no_grad():
    total = 0
    correct = 0
    for texts, labels in test_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        _, prediction = torch.max(output, dim=1)

        total += len(texts)
        correct += (prediction == labels).sum().item()

    print("Accuracy of {} test reviews is {}%".format(len(train_dataset), correct / total * 100))

transformer.py

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext import datasets
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

num_epochs = 5
batch_size = 50
learning_rate = 0.001

max_sequence = 400
embedding_dim = 96
hidden_dim = 256
num_layers = 2
output_dim = 2
nhead = 8

train_pipe, test_pipe = datasets.IMDB()

tokenizer = get_tokenizer("basic_english")

vocab = build_vocab_from_iterator(
    [tokenizer(text) for label, text in train_pipe],
    min_freq=2,
    specials=["<unk>", "<pad>"],
    special_first=True,
)
vocab.set_default_index(vocab["<unk>"])
vocab_len = len(vocab)
padding_idx = vocab["<pad>"]

class IMDBDataset(Dataset):
    def __init__(self, pipe):
        self.texts = []
        self.labels = []
        for label, text in pipe:
            self.texts.append(text)
            self.labels.append(label)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index] - 1
        tokens = [vocab[token] for token in tokenizer(text)]
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(
            label, dtype=torch.long
        )

train_dataset = IMDBDataset(train_pipe)
test_dataset = IMDBDataset(test_pipe)

def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=padding_idx)
    return texts[:, :max_sequence], torch.tensor(labels)

train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TransformerModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_len,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )
        self.positional_encoding = nn.Parameter(
            torch.zeros(1, max_sequence, embedding_dim)
        )
        self.transformer = nn.Transformer(
            d_model=embedding_dim,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_dim,
            dropout=0.5,
            batch_first=True,
        )
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        seq_length = x.size(1)
        x = self.embedding(x) + self.positional_encoding[:, :seq_length, :]
        output = self.transformer(x, x)
        output = output.mean(dim=1)
        return self.fc(output)

model = TransformerModel().to(device)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for i, (texts, labels) in enumerate(train_loader):
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print(
                "Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}".format(
                    epoch + 1, num_epochs, i + 1, len(train_loader), loss.item()
                )
            )

model.eval()

with torch.no_grad():
    total = 0
    correct = 0
    for texts, labels in test_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        output = model(texts)
        _, prediction = torch.max(output, dim=1)

        total += len(texts)
        correct += (prediction == labels).sum().item()

    print(
        "Accuracy of {} test reviews is {:.2f}%".format(
            len(test_dataset), correct / total * 100
        )
    )
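
The model above feeds the same embedded sequence to both the encoder and the decoder of nn.Transformer. An encoder-only classifier is a common alternative; below is a minimal sketch under the same hyperparameters (not the configuration the results in the table were measured with):

class TransformerEncoderModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(vocab_len, embedding_dim, padding_idx=padding_idx)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_sequence, embedding_dim))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=nhead,
            dim_feedforward=hidden_dim,
            dropout=0.5,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        seq_length = x.size(1)
        out = self.embedding(x) + self.positional_encoding[:, :seq_length, :]
        # Mask <pad> positions so attention ignores them
        out = self.encoder(out, src_key_padding_mask=(x == padding_idx))
        return self.fc(out.mean(dim=1))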