POS Tagging Using a Seq2Seq-Style Model in Keras

# Imports
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


# Example Data (Replace with your dataset)
sentences = ["I love programming", "You are learning", "We enjoy coding"]
tags = [["PRON", "VERB", "NOUN"], ["PRON", "VERB", "VERB"], ["PRON", "VERB", "NOUN"]]
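Each sentence supplies exactly one tag per token, which is what lets the model predict a tag at every timestep. A quick sanity check worth running on your own dataset (my addition, not part of the original recipe):

# Every sentence must supply one tag per whitespace token
for sent, tag_seq in zip(sentences, tags):
    assert len(sent.split()) == len(tag_seq), f"length mismatch for: {sent}"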


# Preprocessing
# Tokenize the input sentences (Tokenizer lowercases by default and indexes from 1)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
input_sequences = tokenizer.texts_to_sequences(sentences)

# Tokenize the tag sequences the same way; lists of tokens are accepted as-is,
# but note the tags are also lowercased ("PRON" becomes "pron")
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(tags)
tag_index = tag_tokenizer.word_index
output_sequences = tag_tokenizer.texts_to_sequences(tags)
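With this toy corpus every word occurs exactly once, so the Tokenizer (which reserves id 0 for padding) should assign ids roughly in order of first appearance:

print(word_index)
# expected: {'i': 1, 'love': 2, 'programming': 3, 'you': 4, 'are': 5,
#            'learning': 6, 'we': 7, 'enjoy': 8, 'coding': 9}
print(input_sequences)
# expected: [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
print(tag_index)
# expected: {'verb': 1, 'pron': 2, 'noun': 3} -- most frequent tag first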


# Padding
# Pad every sequence to the length of the longest sentence (index 0 = padding)
max_seq_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, padding='post', maxlen=max_seq_length)
output_sequences = pad_sequences(output_sequences, padding='post', maxlen=max_seq_length)
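Both arrays are now rectangular NumPy matrices. With this toy data every sentence already has 3 tokens, so max_seq_length is 3 and padding is effectively a no-op:

print(max_seq_length)          # 3
print(input_sequences.shape)   # (3, 3): sentences x timesteps
print(output_sequences.shape)  # (3, 3): integer tag ids, 0 where padded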


# One-hot encoding for tags
# Rows of the identity matrix turn each integer tag id into a one-hot vector;
# the +1 accounts for the padding index 0
output_sequences = np.array([
    np.eye(len(tag_index) + 1)[seq] for seq in output_sequences
])
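The one-hot step adds a class axis, producing the 3-D targets that categorical_crossentropy expects:

print(output_sequences.shape)  # (3, 3, 4): sentences x timesteps x (num_tags + 1)

An equivalent design choice is to skip this step and train on the integer tag ids directly with loss='sparse_categorical_crossentropy'.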


# Train-Test Split
# (with the 3-sentence toy set this leaves a single test example)
X_train, X_test, y_train, y_test = train_test_split(
    input_sequences, output_sequences, test_size=0.2, random_state=42
)


# Model Definition
input_layer = Input(shape=(max_seq_length,))
embedding_dim = 50
vocab_size = len(word_index) + 1      # +1 for padding index 0
tag_vocab_size = len(tag_index) + 1

# Embed tokens, run a single LSTM over the sequence, and predict a tag
# distribution at every timestep (return_sequences=True)
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
encoder_lstm = LSTM(units=128, return_sequences=True)(embedding)
output_layer = Dense(units=tag_vocab_size, activation='softmax')(encoder_lstm)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
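One caveat: padded timesteps count toward the loss and accuracy as defined above. A minimal masked variant (my own sketch, not part of the original recipe) tells Keras to ignore padding id 0:

masked_input = Input(shape=(max_seq_length,))
masked_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                             mask_zero=True)(masked_input)  # mask padding id 0
masked_lstm = LSTM(units=128, return_sequences=True)(masked_embedding)
masked_output = Dense(units=tag_vocab_size, activation='softmax')(masked_lstm)

masked_model = Model(inputs=masked_input, outputs=masked_output)
masked_model.compile(optimizer='adam', loss='categorical_crossentropy',
                     metrics=['accuracy'])

The Embedding layer's mask propagates through the LSTM and Dense layers, so masked positions are dropped from the loss automatically.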


# Training
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))


# Testing Prediction
sample_sentence = ["I enjoy learning"]
sample_sequence = tokenizer.texts_to_sequences(sample_sentence)
sample_padded = pad_sequences(sample_sequence, padding='post', maxlen=max_seq_length)
predictions = model.predict(sample_padded)
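model.predict returns one softmax distribution over the tag classes per timestep:

print(predictions.shape)  # (1, 3, 4): batch x timesteps x (num_tags + 1) here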


# Decode Predictions
# Map each timestep's argmax id back to its tag string; index_word is the
# inverse of word_index, and id 0 (the padding slot) has no entry
predicted_tags = [
    [tag_tokenizer.index_word.get(int(np.argmax(tag)), 'PAD') for tag in sequence]
    for sequence in predictions
]

print("Sample Sentence:", sample_sentence)
print("Predicted Tags:", predicted_tags)

