Python: Generate Ngrams and Nterms

Mohamad Mahmood
2 min read

[1] Generate Ngrams

from itertools import islice

def generate_ngrams(text, ngram_size=(2, 2), min_word_size=3, return_tuple=True):
    """
    Generate strict n-grams (runs of adjacent words) from the input text.

    The text is split on whitespace, words shorter than ``min_word_size`` are
    dropped, and a sliding window of every requested size is moved across the
    remaining words. Filtering happens before windowing, so two words that
    were separated only by a dropped short word end up adjacent.

    Args:
        text (str): The input text to process.
        ngram_size (tuple, optional): Inclusive range of n-gram sizes to
            generate, as (min_n, max_n). Sizes below 1 are skipped.
            Defaults to (2, 2).
        min_word_size (int, optional): Minimum word length to include in
            n-grams. Defaults to 3.
        return_tuple (bool, optional): Whether to return n-grams as tuples.
            If False, each n-gram is returned as its words joined with "_".
            Defaults to True.

    Returns:
        list: N-grams ordered by size, then by position in the text. Each
        item is a tuple of words, or a "_"-joined string when
        ``return_tuple`` is False. Empty when there are too few words.
    """
    # Tokenize on whitespace and drop the words that are too short.
    tokens = [word for word in text.split() if len(word) >= min_word_size]

    min_n, max_n = ngram_size
    ngrams = []
    # A window narrower than one word is not an n-gram, so never start below 1.
    for n in range(max(min_n, 1), max_n + 1):
        # Slices are lists; convert to tuples so the result matches the
        # documented contract and stays hashable (usable in sets/Counter).
        ngrams.extend(
            tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
        )

    if not return_tuple:
        ngrams = ["_".join(ngram) for ngram in ngrams]

    return ngrams

[2] Generate Nterms

from itertools import combinations

def generate_nterms(text, ngram_size=(2, 2), min_word_size=3, return_tuple=True):
    """
    Build every n-word combination ("n-term") of the words in the input text.

    Unlike strict n-grams, the words of a combination do not have to be
    adjacent in the text; they only keep their original relative order.

    Args:
        text (str): The input text to process.
        ngram_size (tuple, optional): Inclusive range of combination sizes,
            as (min_n, max_n). Defaults to (2, 2).
        min_word_size (int, optional): Words shorter than this are discarded
            before combining. Defaults to 3.
        return_tuple (bool, optional): When True each combination is a tuple
            of words; when False it is the words joined with "_".
            Defaults to True.

    Returns:
        list: Combinations grouped by size (smallest first), as tuples or
        joined strings.
    """
    # Whitespace tokenization, then discard the words that are too short.
    words = text.split()
    kept = [w for w in words if len(w) >= min_word_size]

    # Walk the requested sizes in ascending order and collect each
    # combination that itertools produces for that size.
    smallest, largest = ngram_size
    terms = [
        combo
        for size in range(smallest, largest + 1)
        for combo in combinations(kept, size)
    ]

    if return_tuple:
        return terms
    return ["_".join(combo) for combo in terms]

# Example usage: list every 1- to 3-word combination of the words that have
# at least three characters, each combination joined with "_".
text = "i literally have not been to sleep because i cannot believe this is happening"
ngrams = generate_nterms(text, ngram_size=(1, 3), min_word_size=3, return_tuple=False)
print(ngrams)
0
Subscribe to my newsletter

Read articles from Mohamad Mahmood directly inside your inbox. Subscribe to the newsletter, and don't miss out.

Written by

Mohamad Mahmood
Mohamad Mahmood

Mohamad's interest is in Programming (Mobile, Web, Database and Machine Learning). He studies at the Center For Artificial Intelligence Technology (CAIT), Universiti Kebangsaan Malaysia (UKM).