# -*- coding: utf-8 -*-
import numpy as np
from sklearn import svm
from sklearn import neighbors
from sklearn import tree
from scipy import sparse
import sys
import re


class svmstringclassify:

    def __init__(self, anchored = True, maxlen = 7, prefix = False, suffix = True, wickel = False, type = 'svm_linear', Cvalue = 5.0):
        self.substringmap = {}
        self.clf = None
        self.anchored = anchored
        self.maxlen = maxlen
        self.prefix = prefix
        self.suffix = suffix
        self.wickel = wickel
        self.type = type
        self.Cvalue = Cvalue

    def _getsubstrings(self, word):
        """Returns the desired substrings of maximum length maxlen from the input string.
        The output depends on prefix (use prefixes), suffix (use suffixes),
        anchored (only take substrings from the word edges), and
        wickel (only use wickelfeatures)."""
        word = '#' + word + '#'
        if self.wickel:
            # Wickelfeatures: all character trigrams of the #-padded word
            return set([word[i:i+3] for i in range(len(word)-2)])
        if self.anchored:
            substrings = set()
            if self.prefix:
                substrings |= set([word[0:j] for j in xrange(2, min((self.maxlen+2), len(word)+1))])
            if self.suffix:
                substrings |= set([word[j:len(word)+1] for j in xrange(max(0, len(word)-self.maxlen-1), len(word)-1)])
            return set(substrings)
        # Unanchored: all substrings of the padded word
        substrings = set([word[i:j] for i in range(len(word)) for j in range(i+1, len(word)+1)])
        return substrings

    def train(self, wordsclasses):
        """Takes a list of 2-tuples [(class, word), ..., (class, word)] and trains the classifier."""
        words = [w[1] for w in wordsclasses]
        substrset = set()
        for w in words:
            substrset |= self._getsubstrings(w)
        self.substringmap = dict(zip(substrset, xrange(len(substrset))))
        self._trainsvn(wordsclasses, self.type)

    def callclassifier(self, word):
        """Classifies a word with the currently trained model."""
        feat = self._wordtofeat(word)
        if self.type == 'svm_linear':
            return self.clf.decision_function([feat])  # LinearSVC has no predict_proba
        elif self.type == 'svm':
            return self.clf.predict_proba([feat])
        elif self.type == 'k-neighbors':
            return self.clf.predict_proba([feat])

    def _trainsvn(self, wordsclasses, type):
        """Extracts substring (or wickel) features and calls the SVM training routines."""
        featmatrix = sparse.lil_matrix((len(wordsclasses), len(self.substringmap)), dtype = int)
        for sampleno, wc in enumerate(wordsclasses):
            substringfeatures = self._getsubstrings(wc[1])
            for f in substringfeatures:
                if f in self.substringmap:
                    featmatrix[sampleno, self.substringmap[f]] = 1
        self.features = featmatrix.tocsr()
        self.classes = [w[0] for w in wordsclasses]
        if self.type == 'svm_linear':
            self.clf = svm.LinearSVC(C=self.Cvalue)
        elif self.type == 'svm':
            self.clf = svm.SVC(C=self.Cvalue, probability=True)
        elif self.type == 'k-neighbors':
            self.clf = neighbors.KNeighborsClassifier(n_neighbors = 2)
        self.clf.fit(self.features, self.classes)

    def _wordtofeat(self, word):
        """Extracts the substring features of a word into a binary array."""
        substringfeatures = self._getsubstrings(word)
        feats = np.zeros(len(self.substringmap), int)
        for f in substringfeatures:
            if f in self.substringmap:
                feats[self.substringmap[f]] = 1
        return feats


class Classifier:

    def __init__(self):
        self.paradigms = None
        self.corpus = None
        self.idxtoword = {}  # We index paradigms by number internally
        self.wordtoidx = {}  # These two map the key 0-form to an int and vice versa
        self.config = './classifier_svm.conf'
        self.out = sys.stderr
        self.thislang = ''

    def train(self, paradigms, corpus, config=''):
        """Trains an SVM with word features, given paradigms."""
        self.paradigms = paradigms
        self.corpus = corpus
        trainingdata = []
        index = 0
        for pid, par in self.paradigms.paradigms.iteritems():
            self.idxtoword[index] = pid  # Paradigm id-word
            self.wordtoidx[pid] = index  # Paradigm index
            index += 1
            for (_, inst) in par.instances:
                zeroform = inst['0']  # the 0-form in the paradigm
                trainingdata.append((index, zeroform))  # 2-tuples of (index, 0-form)
        if not self.thislang:
            self.thislang = re.sub(r'^([^-_]+_[^-_]+).*$', r'\1', self.paradigms.paradigmfilename.split('/')[0])
        try:
            import os.path
            if os.path.isfile(self.config):
                conffile = open(self.config)
            else:
                conffile = open('src/' + self.config)
            conffile = conffile.readlines()
            thisconf = [line.strip() for line in conffile if '#' not in line and self.thislang in line]
            if len(thisconf) > 0:
                (fname, maxlen, prefix, suffix, C) = thisconf[0].split('\t')
                self.svm = svmstringclassify(maxlen = int(maxlen), prefix = int(prefix), suffix = int(suffix), Cvalue = float(C))
            else:
                print >> self.out, "No setting found for %s in configuration file, using defaults" % self.thislang
                print >> self.out, conffile
                self.svm = svmstringclassify()
        except IOError:
            print "No configuration file found, using defaults"
            self.svm = svmstringclassify()
        self.svm.train(trainingdata)

    def classify(self, word):
        """Returns a list of (0-form, score) pairs, sorted by score, for the
        paradigms whose patterns the word can match."""
        validcandidates = set()
        for (n, p) in self.paradigms.paradigms.iteritems():
            # Check which candidates match the first pattern
            r = p.match_patterns(word, True)
            if p.match_patterns_light(word, only_baseform = True, multiple = False):
                validcandidates.add(self.wordtoidx[n])
        classes = self.svm.callclassifier(word)[0]  # list of scores
        cindices = sorted(range(len(classes)), key=lambda k: classes[k], reverse = True)  # sort and get indices to 0-form
        cfiltered = [(self.idxtoword[c], classes[c]) for c in cindices if c in validcandidates]
        return cfiltered
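
# Illustrative usage sketch (not part of the original module): trains the
# standalone svmstringclassify on a handful of toy (class, word) pairs and
# scores two unseen words. The labels and words below are invented for
# demonstration only; the Classifier class needs real paradigm objects and a
# corpus, so it is not exercised here.
if __name__ == '__main__':
    toydata = [(1, 'walked'), (1, 'talked'), (1, 'jumped'),
               (2, 'running'), (2, 'singing'), (2, 'eating')]
    demo = svmstringclassify(maxlen = 4, prefix = False, suffix = True)
    demo.train(toydata)
    # With the default type 'svm_linear' this prints the raw LinearSVC
    # decision_function output for each unseen word
    print demo.callclassifier('laughed')
    print demo.callclassifier('sleeping')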