Transformers Pipeline: Translate English Phrase to French

2 min read
[1] Get dataset
import pandas as pd
# Location of the airline tweets CSV
dset_url = 'https://archive.org/download/misc-dataset/airline-tweets.csv'

# Read the CSV from the URL into a DataFrame
df_airline = pd.read_csv(filepath_or_buffer=dset_url)

# Print a summary of the loaded DataFrame
df_airline.info()
[2] Clean
import pandas as pd
# !pip install neattext
import neattext.functions as nfx
# Clean the tweet text and store the result in a new 'clean' column.
# URLs, user handles and hashtags are removed FIRST, while the characters
# their patterns rely on ('://', '.', '@', '#') are still present. Digits and
# punctuation are stripped only afterwards; running those steps earlier leaves
# hashtag words and URL fragments behind in the cleaned text.
# NOTE(review): assumes nfx.remove_punctuations strips '#' and '.' with its
# default settings -- confirm against the installed neattext version.
df_airline['clean'] = (
    df_airline['text']
    .apply(nfx.remove_urls)
    .apply(nfx.remove_userhandles)
    .apply(nfx.remove_hashtags)
    .apply(nfx.fix_contractions)  # expand contractions before apostrophes go
    .apply(nfx.remove_numbers)
    .apply(nfx.remove_punctuations)
    .apply(str.lower)
)
Define the phrase extraction task
# !pip install rake-nltk
from rake_nltk import Rake
def rake_extract_phrases(input_text):
    """Extract RAKE key phrases from ``input_text``, highest score first.

    Args:
        input_text: Text to run RAKE keyword extraction on.

    Returns:
        A list of ``(score, phrase)`` items ordered by descending score.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(input_text)

    # Copy the ranked results into a fresh list, then order it in place by
    # the score stored at index 0 of each item.
    scored_phrases = list(extractor.get_ranked_phrases_with_scores())
    scored_phrases.sort(key=lambda entry: entry[0], reverse=True)
    return scored_phrases
# Demo: run the extractor on a single short sentence and print what it returns
input_string = "thank you we got on a different flight."
result = rake_extract_phrases(input_text=input_string)
print(result)
Extract phrases
# Run RAKE over every cleaned tweet; each cell becomes a list of
# (score, phrase) items.
df_airline['phrase_score'] = df_airline['clean'].apply(rake_extract_phrases)

# Keep only the phrase text (index 1 of each item), dropping the score.
df_airline['phrase'] = df_airline['phrase_score'].apply(
    lambda scored: [item[1] for item in scored]
)

# Preview the frame that was just extended. `df_airline_phrase` is only
# created in the explode step further down, so displaying it here raised a
# NameError.
display(df_airline.head())
Explode phrases into separate documents
# One row per extracted phrase: keep the three columns of interest, expand the
# list-valued 'phrase' column, then renumber the rows from 0.
df_airline_phrase = (
    df_airline[['phrase', 'airline_sentiment', 'negativereason']]
    .explode('phrase')
    .reset_index(drop=True)
)
display(df_airline_phrase)
Select phrase documents
# Keep multi-word phrases from negative tweets whose recorded reason is
# 'Customer Service Issue'.
is_negative = df_airline_phrase['airline_sentiment'] == 'negative'
has_reason = df_airline_phrase['negativereason'].notna()  # filter out NaN values
is_service_issue = df_airline_phrase['negativereason'] == 'Customer Service Issue'
is_multi_word = df_airline_phrase['phrase'].str.split().str.len() > 1  # 2+ words

df_airline_phrase_selected = df_airline_phrase[
    is_negative & has_reason & is_service_issue & is_multi_word
]
print(len(df_airline_phrase_selected))

# Drop rows without a phrase. The trailing .copy() makes the result an
# independent DataFrame, so assigning new columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
df_airline_phrase_selected = (
    df_airline_phrase_selected.dropna(subset=['phrase']).copy()
)
print(len(df_airline_phrase_selected))
display(df_airline_phrase_selected.head())
[3] Translate
(approx. 30 minutes with CUDA)
from transformers import pipeline
# Build the English-to-French translation pipeline on the GPU
translator = pipeline(
    task="translation_en_to_fr",
    model="Helsinki-NLP/opus-mt-en-fr",
    device=0,  # 0 selects the first GPU
)
def translate_to_french(text):
    """Translate an English string to French with the module-level ``translator``.

    Args:
        text: English text to translate. Missing values (``None``, ``NaN``),
            other non-string inputs, and empty or whitespace-only strings are
            not sent to the model.

    Returns:
        The translated text, or ``""`` when the input is unusable, the
        pipeline returns no usable result, or translation raises.
    """
    # Guard non-strings explicitly: previously a NaN reached ``text.strip()``
    # and was only handled by the blanket ``except`` below, which printed a
    # misleading "Error translating text" message for it.
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        translated = translator(text, max_length=50)  # Adjust max_length as needed
        # Only a non-empty list result carries a translation to return.
        if isinstance(translated, list) and translated:
            return translated[0]['translation_text']
    except Exception as e:
        # Best-effort: report the failure and fall through to the
        # empty-string result instead of raising.
        print(f"Error translating text: {text}. Error: {e}")
    return ""
# Translate every selected phrase and keep the result in a new
# 'phrase_french' column
french_phrases = df_airline_phrase_selected['phrase'].apply(translate_to_french)
df_airline_phrase_selected['phrase_french'] = french_phrases

# Show the frame, now including the translations
display(df_airline_phrase_selected)
[4] Preview
# Preview the English phrases beside their French translations. The selected
# frame holds 'phrase' and 'phrase_french'; it has no 'clean' or
# 'clean_french' columns, so selecting those raised a KeyError.
display(df_airline_phrase_selected[['phrase', 'phrase_french']].head())
[5] Save
# Write the selected phrases and their translations to CSV: header row kept,
# index column omitted
output_csv = "airline-tweets-clean-phrase-french.csv"
df_airline_phrase_selected.to_csv(output_csv, index=False, header=True)
.
0
Subscribe to my newsletter
Read articles from Mohamad Mahmood directly inside your inbox. Subscribe to the newsletter, and don't miss out.
Written by

Mohamad Mahmood
Mohamad Mahmood
Mohamad's interest is in Programming (Mobile, Web, Database and Machine Learning). He studies at the Center For Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).