Transformers Pipeline: Translate English to French

2 min read
[1] Get dataset
import pandas as pd
# Download the airline-tweets CSV straight from its URL into a DataFrame.
dset_url = 'https://archive.org/download/misc-dataset/airline-tweets.csv'
df_airline = pd.read_csv(filepath_or_buffer=dset_url)

# Print the column names, dtypes and non-null counts.
df_airline.info()
[2] Clean
import pandas as pd
# !pip install neattext
import neattext.functions as nfx
# Clean the text data.
# Order matters: URLs, @handles and #hashtags are recognised by the
# punctuation (and sometimes digits) they contain, so they are stripped
# first. Running remove_punctuations / remove_numbers ahead of them can
# destroy those markers and leave URL or hashtag fragments in the text.
# Contractions are expanded before punctuation removal so the apostrophe
# is still there to match on. Lower-casing stays last.
df_airline['clean'] = (
    df_airline['text']
    .apply(nfx.remove_urls)
    .apply(nfx.remove_userhandles)
    .apply(nfx.remove_hashtags)
    .apply(nfx.fix_contractions)
    .apply(nfx.remove_numbers)
    .apply(nfx.remove_punctuations)
    .apply(str.lower)
)
[3] Translate
(This step takes approximately 30 minutes with CUDA.)
from transformers import pipeline
# Build the English-to-French translation pipeline on the GPU.
translator = pipeline(
    "translation_en_to_fr",
    model="Helsinki-NLP/opus-mt-en-fr",
    device=0,  # device=0 selects the first GPU
)
# Function to translate text
def translate_to_french(text):
    """Translate one English string to French, returning "" on any failure.

    Uses the module-level ``translator`` pipeline.

    Args:
        text: English text to translate. Anything that is not a non-blank
            string (``None``, a float NaN from a missing DataFrame cell,
            a number, whitespace only) is treated as nothing to translate.

    Returns:
        The ``translation_text`` of the first pipeline result, or an empty
        string when the input is blank or not a string, the pipeline
        returns no result, or translation raises.
    """
    # A missing cell arrives as float NaN, which is truthy and has no
    # .strip(); reject non-strings explicitly instead of relying on the
    # exception handler below to absorb the AttributeError.
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        # Translate the text
        translated = translator(text, max_length=50)  # Adjust max_length as needed
        # Only read the result when the pipeline returned a non-empty list.
        if isinstance(translated, list) and translated:
            return translated[0]['translation_text']
        return ""  # Return an empty string if translation fails
    except Exception as e:
        # Deliberately broad: this runs once per row over a whole column,
        # and a single bad row should not abort the entire run.
        print(f"Error translating text: {text}. Error: {e}")
        return ""
# Translate every cleaned tweet, one call to translate_to_french per row,
# and store the results in a new column next to the English text.
french_texts = df_airline['clean'].apply(translate_to_french)
df_airline['clean_french'] = french_texts

# Render the full DataFrame in the notebook.
display(df_airline)
[4] Preview
# Show the first rows of the cleaned English text beside its French
# translation. These are the columns the earlier steps create ('clean'
# and 'clean_french'); no 'phrase' or 'phrase_french' column is ever
# created, so selecting those raises KeyError.
display(df_airline[['clean', 'clean_french']].head())
[5] Save
# Write the DataFrame to a CSV file with a header row and without the
# index column.
df_airline.to_csv(
    "airline-tweets-clean-french.csv",
    header=True,
    index=False,
)
.
0
Subscribe to my newsletter
Read articles from Mohamad Mahmood directly inside your inbox. Subscribe to the newsletter, and don't miss out.
Written by

Mohamad Mahmood
Mohamad Mahmood
Mohamad's interest is in Programming (Mobile, Web, Database and Machine Learning). He studies at the Center For Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).