# POS tagging with a sequence-to-sequence style Keras model
# Imports
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Example data (replace with your own dataset).
sentences = ["I love programming", "You are learning", "We enjoy coding"]
tags = [["PRON", "VERB", "NOUN"], ["PRON", "VERB", "VERB"], ["PRON", "VERB", "NOUN"]]

# --- Preprocessing -------------------------------------------------------
# Fit one tokenizer on the words and a second one on the POS tags, then
# convert both corpora into integer-id sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
input_sequences = tokenizer.texts_to_sequences(sentences)

tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(tags)
tag_index = tag_tokenizer.word_index
output_sequences = tag_tokenizer.texts_to_sequences(tags)

# Pad every sequence to the length of the longest sentence (post-padding
# with 0, the id both tokenizers reserve for "padding").
max_seq_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, padding='post', maxlen=max_seq_length)
output_sequences = pad_sequences(output_sequences, padding='post', maxlen=max_seq_length)

# One-hot encode the tag ids; +1 because id 0 (padding) is outside word_index.
identity = np.eye(len(tag_index) + 1)
output_sequences = np.array([identity[seq] for seq in output_sequences])
# --- Train/test split ----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    input_sequences, output_sequences, test_size=0.2, random_state=42
)

# --- Model definition ----------------------------------------------------
# Embedding -> LSTM emitting one hidden state per timestep -> per-timestep
# softmax over the tag vocabulary (a sequence-labelling classifier).
embedding_dim = 50
vocab_size = len(word_index) + 1       # +1 for the reserved padding id 0
tag_vocab_size = len(tag_index) + 1    # +1 for the reserved padding id 0

input_layer = Input(shape=(max_seq_length,))
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
encoder_lstm = LSTM(units=128, return_sequences=True)(embedding)
output_layer = Dense(units=tag_vocab_size, activation='softmax')(encoder_lstm)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# --- Training ------------------------------------------------------------
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))
# --- Testing prediction --------------------------------------------------
sample_sentence = ["I enjoy learning"]
sample_sequence = tokenizer.texts_to_sequences(sample_sentence)
sample_padded = pad_sequences(sample_sequence, padding='post', maxlen=max_seq_length)
predictions = model.predict(sample_padded)  # shape (1, max_seq_length, tag_vocab_size)

# --- Decode predictions --------------------------------------------------
# BUG FIX: the original used list(tag_index.keys())[np.argmax(tag)], but Keras
# tag ids start at 1 (0 is the padding class), so positional indexing into the
# key list was off by one — and an argmax equal to the highest tag id raised
# IndexError. Decode through an explicit id -> tag reverse map instead, and
# label the padding class (id 0) explicitly.
index_to_tag = {idx: tag for tag, idx in tag_index.items()}
predicted_tags = [
    [index_to_tag.get(int(np.argmax(step)), "PAD") for step in sequence]
    for sequence in predictions
]
print("Sample Sentence:", sample_sentence)
print("Predicted Tags:", predicted_tags)
# Comments
# Post a Comment