# -*- coding: utf-8 -*-
import numpy as np
from sklearn import svm
from sklearn import neighbors
from sklearn import tree
from scipy import sparse
import sys
import re


class svmstringclassify:

    def __init__(self, anchored = True, maxlen = 7, prefix = False, suffix = True, wickel = False, type = 'svm_linear', Cvalue = 5.0):
        self.substringmap = {}
        self.clf = None
        self.anchored = anchored
        self.maxlen = maxlen
        self.prefix = prefix
        self.suffix = suffix
        self.wickel = wickel
        self.type = type
        self.Cvalue = Cvalue

    def _getsubstrings(self, word):
        """Returns the desired substrings of maximum length maxlen from the input string.
        The output depends on prefix (use prefixes), suffix (use suffixes),
        anchored (only take substrings from the word edges), and
        wickel (only use wickelfeatures)."""
        word = '#' + word + '#'
        if self.wickel:
            # Wickelfeatures: all character trigrams of the #-padded word
            return set([word[i:i+3] for i in range(len(word)-2)])
        if self.anchored:
            substrings = set()
            if self.prefix:
                substrings |= set([word[0:j] for j in xrange(2, min((self.maxlen+2), len(word)+1))])
            if self.suffix:
                substrings |= set([word[j:len(word)+1] for j in xrange(max(0, len(word)-self.maxlen-1), len(word)-1)])
            return set(substrings)
        # Unanchored: all substrings of the padded word
        substrings = set([word[i:j] for i in range(len(word)) for j in range(i+1, len(word)+1)])
        return substrings

    def train(self, wordsclasses):
        """Takes a list of 2-tuples [(class, word), ..., (class, word)] and trains the classifier."""
        words = [w[1] for w in wordsclasses]
        substrset = set()
        for w in words:
            substrset |= self._getsubstrings(w)
        self.substringmap = dict(zip(substrset, xrange(len(substrset))))
        self._trainsvn(wordsclasses, self.type)

    def callclassifier(self, word):
        """Classifies a word with the currently trained model."""
        feat = self._wordtofeat(word)
        if self.type == 'svm_linear':
            return self.clf.decision_function([feat])  # LinearSVC has no predict_proba
        elif self.type == 'svm':
            return self.clf.predict_proba([feat])
        elif self.type == 'k-neighbors':
            return self.clf.predict_proba([feat])

    def _trainsvn(self, wordsclasses, type):
        """Extracts substring (or wickel) features and calls the SVM training routines."""
        featmatrix = sparse.lil_matrix((len(wordsclasses), len(self.substringmap)), dtype = int)
        for sampleno, wc in enumerate(wordsclasses):
            substringfeatures = self._getsubstrings(wc[1])
            for f in substringfeatures:
                if f in self.substringmap:
                    featmatrix[sampleno, self.substringmap[f]] = 1
        self.features = featmatrix.tocsr()
        self.classes = [w[0] for w in wordsclasses]
        if self.type == 'svm_linear':
            self.clf = svm.LinearSVC(C=self.Cvalue)
        elif self.type == 'svm':
            self.clf = svm.SVC(C=self.Cvalue, probability=True)
        elif self.type == 'k-neighbors':
            self.clf = neighbors.KNeighborsClassifier(n_neighbors = 2)
        self.clf.fit(self.features, self.classes)

    def _wordtofeat(self, word):
        """Extracts the substring features of a word into a binary array."""
        substringfeatures = self._getsubstrings(word)
        feats = np.zeros(len(self.substringmap), int)
        for f in substringfeatures:
            if f in self.substringmap:
                feats[self.substringmap[f]] = 1
        return feats


class Classifier:

    def __init__(self):
        self.paradigms = None
        self.corpus = None
        self.idxtoword = {}  # We index paradigms by number internally
        self.wordtoidx = {}  # These two map the key 0-form to an int and vice versa
        self.config = './classifier_svm.conf'
        self.out = sys.stderr
        self.thislang = ''

    def train(self, paradigms, corpus, config=''):
        """Trains an SVM with word features, given paradigms."""
        self.paradigms = paradigms
        self.corpus = corpus
        trainingdata = []
        index = 0
        for pid, par in self.paradigms.paradigms.iteritems():
            self.idxtoword[index] = pid  # Paradigm id-word
            self.wordtoidx[pid] = index  # Paradigm index
            index += 1
            for (_, inst) in par.instances:
                zeroform = inst['0']  # the 0-form in the paradigm
                trainingdata.append((index, zeroform))  # 2-tuples of (index, 0-form)
        if not self.thislang:
            self.thislang = re.sub(r'^([^-_]+_[^-_]+).*$', r'\1', self.paradigms.paradigmfilename.split('/')[0])
        try:
            import os.path
            if os.path.isfile(self.config):
                conffile = open(self.config)
            else:
                conffile = open('src/' + self.config)
            conffile = conffile.readlines()
            thisconf = [line.strip() for line in conffile if '#' not in line and self.thislang in line]
            if len(thisconf) > 0:
                (fname, maxlen, prefix, suffix, C) = thisconf[0].split('\t')
                self.svm = svmstringclassify(maxlen = int(maxlen), prefix = int(prefix), suffix = int(suffix), Cvalue = float(C))
            else:
                print >> self.out, "No setting found for %s in configuration file, using defaults" % self.thislang
                print >> self.out, conffile
                self.svm = svmstringclassify()
        except IOError:
            print "No configuration file found, using defaults"
            self.svm = svmstringclassify()
        self.svm.train(trainingdata)

    def classify(self, word):
        """Returns a list of (0-form, score) pairs, sorted by score, for the
        paradigms whose patterns the word can match."""
        validcandidates = set()
        for (n, p) in self.paradigms.paradigms.iteritems():
            # Check which candidates match the first pattern
            r = p.match_patterns(word, True)
            if p.match_patterns_light(word, only_baseform = True, multiple = False):
                validcandidates.add(self.wordtoidx[n])
        classes = self.svm.callclassifier(word)[0]  # list of scores
        cindices = sorted(range(len(classes)), key=lambda k: classes[k], reverse = True)  # sort and get indices to 0-form
        cfiltered = [(self.idxtoword[c], classes[c]) for c in cindices if c in validcandidates]
        return cfiltered
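
# Illustrative usage sketch (not part of the original module): trains the
# standalone svmstringclassify on a handful of toy (class, word) pairs and
# scores two unseen words. The labels and words below are invented for
# demonstration only; the Classifier class needs real paradigm objects and a
# corpus, so it is not exercised here.
if __name__ == '__main__':
    toydata = [(1, 'walked'), (1, 'talked'), (1, 'jumped'),
               (2, 'running'), (2, 'singing'), (2, 'eating')]
    demo = svmstringclassify(maxlen = 4, prefix = False, suffix = True)
    demo.train(toydata)
    # With the default type 'svm_linear' this prints the raw LinearSVC
    # decision_function output for each unseen word
    print demo.callclassifier('laughed')
    print demo.callclassifier('sleeping')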