#-*- coding: utf-8 -*-

'''
Created on Oct 21, 2016

Pattern identification methods for Swedish words

@author: David
'''
from nltk import ngrams as nltkngrams
from nltk.stem.snowball import SwedishStemmer

def ngrams(word, n):
    '''
    Collects the n-grams of a word into a list.

    word: the sequence (normally a string) to take n-grams from
    n: the length of each n-gram

    The n-grams are whatever nltk's ngrams(word, n) yields, kept in the
    order they are produced.
    '''
    grams = nltkngrams(word, n)
    return list(grams)

def padded_ngrams(word, n):
    '''
    Returns all n-grams of a word after wrapping it in boundary markers:
    "#" is put in front of the word and "$" behind it, and the n-grams are
    then computed over the padded word.
    '''
    start_marker, end_marker = "#", "$"
    return ngrams(start_marker + word + end_marker, n)

def suffixes(word):
    '''
    Returns the suffix (if any) of a word by applying the snowball stemmer for Swedish
    to the word and returning the difference between the original word and the calculated stem

    word: a text string, or a byte string that is decoded with the
          'unicode-escape' codec before stemming

    Returns the part of the (decoded) word that follows its first len(stem)
    characters; this is the empty string when the stem is at least as long
    as the word.
    '''
    # Only byte strings can (and need to) be decoded. Calling .decode() on
    # text fails on Python 3, where str has no such method, and on Python 2
    # it implicitly ASCII-encodes unicode input first, which breaks on
    # characters such as a-ring, a-umlaut and o-umlaut.
    if isinstance(word, bytes):
        word = word.decode('unicode-escape')

    stem = SwedishStemmer().stem(word)
    # NOTE(review): this assumes the stem is a prefix of the word, so that
    # cutting off len(stem) characters leaves exactly the suffix - confirm
    # this holds for every rule of the stemmer.
    return word[len(stem):]

if __name__ == '__main__':
    # Small manual demo: print the 4-grams of an English sample word ...
    sample_ngrams = ngrams("trigrams", 4)
    print(sample_ngrams)
    # ... followed by the suffix found for a Swedish sample word.
    sample_suffix = suffixes("vatten")
    print(sample_suffix)