Python: Generate Ngrams and Nterms

2 min read
[1] Generate Ngrams
from itertools import islice
def generate_ngrams(text, ngram_size=(2, 2), min_word_size=3, return_tuple=True):
    """
    Generate strict n-grams (adjacent word sequences) from the input text.

    Words shorter than ``min_word_size`` are dropped *before* the windows are
    formed, so words on either side of a dropped word become adjacent.

    Args:
        text (str): The input text to process; split on whitespace.
        ngram_size (tuple, optional): Inclusive range of n-gram sizes to
            generate, as (min_n, max_n). Sizes are expected to be >= 1.
            Defaults to (2, 2).
        min_word_size (int, optional): Minimum word length to include in
            n-grams. Defaults to 3.
        return_tuple (bool, optional): Whether to return n-grams as tuples.
            If False, returns "_"-joined strings. Defaults to True.

    Returns:
        list: N-grams grouped by size (smallest first) and, within a size, in
        order of appearance. Each item is a tuple of words, or a joined
        string when ``return_tuple`` is False.
    """
    # Step 1: Tokenize on whitespace and drop words that are too short
    tokens = [word for word in text.split() if len(word) >= min_word_size]

    # Step 2: Slide a window of each requested size over the tokens
    min_n, max_n = ngram_size
    ngrams = []
    for n in range(min_n, max_n + 1):
        # tuple(...) matters: a bare slice is a list, which is unhashable and
        # contradicts the documented "list of n-gram tuples" return value.
        ngrams.extend(
            tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
        )

    # Step 3: Format the output
    if not return_tuple:
        return ["_".join(ngram) for ngram in ngrams]
    return ngrams
[2] Generate Nterms (order-preserving word combinations that need not be adjacent)
from itertools import combinations
def generate_nterms(text, ngram_size=(2, 2), min_word_size=3, return_tuple=True):
    """
    Build every n-word combination ("n-term") of the words in the text.

    Unlike strict n-grams, the words of a term do not have to be adjacent in
    the text; each term keeps its words in their original relative order.

    Args:
        text (str): The input text to process; split on whitespace.
        ngram_size (tuple, optional): Inclusive range of term sizes to
            generate, as (min_n, max_n). Defaults to (2, 2).
        min_word_size (int, optional): Words shorter than this are discarded
            before combining. Defaults to 3.
        return_tuple (bool, optional): When True, each term is a tuple of
            words; when False, each term is a "_"-joined string.
            Defaults to True.

    Returns:
        list: Terms grouped by size, smallest size first.
    """
    # Keep only the words that are long enough
    words = [tok for tok in text.split() if len(tok) >= min_word_size]

    # Collect the combinations for every size in the inclusive range
    smallest, largest = ngram_size
    terms = []
    for size in range(smallest, largest + 1):
        terms.extend(combinations(words, size))

    # Hand back tuples as-is, or collapse each one into a single string
    if return_tuple:
        return terms
    return ["_".join(term) for term in terms]
# Example usage: list every 1- to 3-word term of a sentence as "_"-joined strings.
text = "i literally have not been to sleep because i cannot believe this is happening"
# The original call targeted `generate_terms`, which is not defined anywhere in
# this file (NameError); the function defined above is `generate_nterms`.
ngrams = generate_nterms(text, ngram_size=(1, 3), min_word_size=3, return_tuple=False)
print(ngrams)
0
Subscribe to my newsletter
Read articles from Mohamad Mahmood directly inside your inbox. Subscribe to the newsletter, and don't miss out.
Written by

Mohamad Mahmood
Mohamad Mahmood
Mohamad's interests are in programming (mobile, web, database, and machine learning). He studies at the Center for Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).