# gen_ngrams.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
This is a generator of skippy n-grams from a string or a list of segments.
developed by Kow Kuroda (kow.kuroda@gmail.com)
Creation: 2024/06/21
Modified:
2024/06/22: changed argument order to ensure compatibility with ngrams_skippy.py and ngrams.py
2024/06/28: renamed variable 'splitter' to 'sep' for compatibility; implemented max_distance
"""

## imports
import itertools

## functions
def segment (text: str, sep: str, check: bool):
    """
    splits a string into a list of non-empty segments.

    Arguments:
    text: the string to split
    sep: the separator handed to str.split; an empty separator makes every
         character its own segment
    check: if True, prints the resulting segments for debugging

    Returns the list of segments with all empty strings removed.
    """
    try:
        pieces = text.split(sep)
    except ValueError:
        ## str.split refuses an empty separator: fall back to one segment per character
        pieces = list(text)
    ## consecutive separators leave empty strings behind; drop them
    segments = [ piece for piece in pieces if len(piece) != 0 ]
    if check:
        print(f"#L: {segments}")
    return segments

##
def gen_ngrams (S: list, n: int, sep = " ", as_list = False, check = False):
    """
    takes a list S of segments and returns a list R of n-grams out of them.

    Arguments:
    S: sequence of segments (a list of strings; a plain string is sliced
       character by character in the same way)
    n: size of the n-grams; must be positive
    sep: string used to join the segments of each n-gram when as_list is False
    as_list: if True, each n-gram is returned as a slice of S rather than
             as a joined string
    check: if True, prints S for debugging

    Returns a list with one entry per n-gram, ordered by start position.
    An empty S gives an empty list. A non-empty S with no more than n
    segments gives a single entry that holds all of S, so the result has
    the same shape (a list of n-grams) whatever the length of S.
    """
    if check:
        print(f"#S: {S}")
    #
    assert n > 0, "n must be a positive integer"
    S_len = len(S)
    ## no segments, no n-grams
    if S_len == 0:
        return [ ]
    if S_len <= n:
        ## the whole input is the only n-gram; wrap it so that callers
        ## always receive a list of n-grams, also when as_list is True
        R = [ S[:] ]
    else:
        ## one window per start position that still leaves room for n segments
        R = [ S[i:i + n] for i in range(S_len - n + 1) ]
    ##
    if as_list:
        return R
    else:
        return [ sep.join(x) for x in R ]

##
def gen_skippy_ngrams (S: list, n: int, max_distance = None, sep: str = " ", missing_mark: str = "…", as_list: bool = False, check: bool = False):
    """
    takes a list of segments and returns a list of skippy n-grams out of them.

    A skippy n-gram picks n segments of S in their original order; wherever
    two picked segments are not adjacent in S, missing_mark is inserted
    between them.

    Arguments:
    S: sequence of segments
    n: number of segments picked per n-gram; must be positive
    max_distance: if None, every combination of n positions is used;
                  otherwise only combinations whose positions all lie within
                  max_distance consecutive positions (last - first < max_distance)
    sep: string used to join the parts of each n-gram when as_list is False
    missing_mark: marker inserted for every gap between picked segments
    as_list: if True, each n-gram is returned as a list of parts
    check: if True, prints S and the selected index tuples for debugging

    Returns a list of n-grams in lexicographic order of their index tuples,
    each index tuple being used exactly once. An empty S gives an empty list.
    A non-empty S with no more than n segments gives a single gap-free entry
    that holds all of S.
    """
    if check:
        print(f"#S: {S}")
    #
    assert n > 0, "n must be a positive integer"
    S_len = len(S)
    ## no segments, no n-grams
    if S_len == 0:
        return [ ]
    ## the whole input is the only n-gram; keep the list-of-n-grams shape
    if S_len <= n:
        if as_list:
            return [ list(S) ]
        else:
            return [ sep.join(S) ]
    ## generate target index list
    if max_distance is None: ## max_distance-free
        P = list(itertools.combinations(range(S_len), n))
    else: ## max_distance implementation
        P = [ ]
        ## a window narrower than n cannot hold n positions
        if max_distance >= n:
            for i in range(S_len):
                ## anchor the first position at i so that overlapping windows
                ## cannot produce the same index tuple more than once
                stop = min(i + max_distance, S_len)
                for rest in itertools.combinations(range(i + 1, stop), n - 1):
                    P.append((i,) + rest)
    ##
    if check:
        print(f"#P: {P}")
    ## generate substrings
    Q = [ ]
    for p in P:
        q = [ ]
        ## reset for every n-gram: a gap is only ever measured against the
        ## previous position of the same n-gram, so no leading mark can appear
        last_i = None
        for i in p:
            if last_i is not None and i != last_i + 1:
                q.append(missing_mark)
            q.append(S[i])
            last_i = i
        #
        Q.append(q)
    ## return result
    if as_list: ## result is a list of unstrung lists
        return Q
    else: ## result is a list of strings
        return [ sep.join(q) for q in Q ]


## aliases
def gen_ngrams_from_str (text: str, n: int, sep = " ", as_list = False, check = False):
    """
    splits a string with the separator and returns the contiguous n-grams
    over the resulting segments.

    Arguments:
    text: the string to split into segments
    n: number of segments per n-gram
    sep: separator used both to split text and to join each n-gram
    as_list: if True, each n-gram is returned as a list of segments
    check: if True, prints the segments for debugging

    Returns a list with one entry per n-gram, ordered by start position.
    A text with fewer than n segments gives an empty list.
    """
    segments = segment(text, sep, check)
    if check:
        print(f"#S: {segments}")
    ## slide a window over every start position; the windows at the tail
    ## come out shorter than n and are discarded
    windows = ( segments[start:start + n] for start in range(len(segments)) )
    grams = [ window for window in windows if len(window) == n ]
    ##
    return grams if as_list else [ sep.join(gram) for gram in grams ]

##
def gen_skippy_ngrams_from_str (text: str, n: int, sep: str = " ", missing_mark: str = "…", max_distance = None, as_list: bool = False, check: bool = False):
    """
    takes a string and returns a list of skippy n-grams out of the segments
    generated using the separator.

    A skippy n-gram picks n segments in their original order; wherever two
    picked segments are not adjacent in the text, missing_mark is inserted
    between them.

    Arguments:
    text: the string to split into segments
    n: number of segments picked per n-gram; must be positive
    sep: separator used both to split text and to join each n-gram
    missing_mark: marker inserted for every gap between picked segments
    max_distance: if None, every combination of n positions is used;
                  otherwise only combinations whose positions all lie within
                  max_distance consecutive positions (last - first < max_distance)
    as_list: if True, each n-gram is returned as a list of parts
    check: if True, prints the segments and the selected index tuples

    Returns a list of n-grams in lexicographic order of their index tuples,
    each index tuple being used exactly once. A text with fewer than n
    segments gives an empty list.
    """
    assert n > 0, "n must be a positive integer"
    ## split into segments
    S = segment(text, sep, check)
    if check:
        print(f"#S: {S}")
    ## generate target index list
    S_len = len(S)
    if max_distance is None:
        P = list(itertools.combinations(range(S_len), n))
    else:
        P = [ ]
        ## a window narrower than n cannot hold n positions
        if max_distance >= n:
            for i in range(S_len):
                ## anchor the first position at i so that overlapping windows
                ## cannot produce the same index tuple more than once
                stop = min(i + max_distance, S_len)
                for rest in itertools.combinations(range(i + 1, stop), n - 1):
                    P.append((i,) + rest)
    if check:
        print(f"#P: {P}")
    ## generate substrings
    Q = [ ]
    for p in P:
        q = [ ]
        ## reset for every n-gram: a gap is only ever measured against the
        ## previous position of the same n-gram, so no leading mark can appear
        last_i = None
        for i in p:
            if last_i is not None and i != last_i + 1:
                q.append(missing_mark)
            q.append(S[i])
            last_i = i
        #
        Q.append(q)
    ## return result
    if as_list: ## result is a list of unstrung lists
        return Q
    else: ## result is a list of strings
        return [ sep.join(q) for q in Q ]

##
def main():
    """
    prints demonstration output for the generators on three sample strings.
    """
    text1 = "abcdefghij"
    text2 = "abc   def  gh ijk lmn   op"
    text3 = "abde"
    ## tests 1-4: skippy n-grams over the segments obtained by splitting on a single space
    for sample, size in ((text1, 3), (text2, 3), (text3, 4), (text3, 5)):
        print(f"input: '{sample}'")
        print(gen_skippy_ngrams_from_str(sample, size, sep = " ", as_list = False, check = False))

    ## tests 5-6: plain n-grams; the raw strings are passed unsegmented and joined with an empty separator
    for sample, size in ((text1, 5), (text2, 3)):
        print(f"input: '{sample}'")
        print(gen_ngrams(sample, size, sep = "", check = False))

## run the demonstrations in main() only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()


### end of script