#-*- coding: utf-8 -*-
'''
Created on Oct 21, 2016

Pattern identification methods for Swedish words

@author: David
'''
from nltk import ngrams as nltkngrams
from nltk.stem.snowball import SwedishStemmer


def ngrams(word, n):
    '''
    Returns all n-grams for a given word, given the word and n.

    The n-grams are produced by nltk's ``ngrams`` helper and collected
    into a list; for a string input each n-gram is a tuple of n
    consecutive characters.
    '''
    return list(nltkngrams(word, n))


def padded_ngrams(word, n):
    '''
    Returns all n-grams for a given word after padding it with a
    leading "#" and a trailing "$", so that n-grams touching the start
    or the end of the word can be told apart from word-internal ones.
    '''
    pword = "#" + word + "$"
    return ngrams(pword, n)


def suffixes(word):
    '''
    Returns the suffix (if any) of a word by applying the snowball
    stemmer for Swedish to the word and returning the difference
    between the original word and the calculated stem.

    The word may be given either as text or as a byte string. Byte
    strings are decoded with the 'unicode-escape' codec before
    stemming; text is used as it is. An empty string is returned when
    the stem is at least as long as the word.
    '''
    # Only byte strings have something to decode. Calling .decode()
    # unconditionally raises AttributeError for a Python 3 str and
    # UnicodeEncodeError for a non-ASCII Python 2 unicode object.
    if isinstance(word, bytes):
        word = word.decode('unicode-escape')

    stemmer = SwedishStemmer()
    stem = stemmer.stem(word)

    # Whatever follows the first len(stem) characters of the word is
    # treated as the suffix.
    return word[len(stem):]


if __name__ == '__main__':
    print(ngrams("trigrams", 4))
    print(suffixes("vatten"))