# encoding: utf-8

from saldo import SALDO

from collections import defaultdict
import codecs
import re

class Splitter:
    """Find compound-word segmentations with the help of a lexicon object.

    The lexicon passed in as ``saldo`` is only used through the following
    calls (result shapes are the ones this class unpacks):

    * ``db_get_lemgrams_pos_msd_by_form(form)`` -> iterable of
      ``(lemgram_id, pos, msd)`` where ``msd`` is a whitespace-separated
      tag string;
    * ``get_lemgrams_by_gf(form)`` -> iterable of 4-tuples with the lemgram
      id in first and the lexicon part-of-speech in third position;
    * ``get_lemgrams_by_form(form)`` -> iterable of sequences whose first
      item is a lemgram id;
    * ``get_senses_by_lemgram(lemgram_id)`` -> iterable of sequences whose
      first item is a sense id;
    * ``get_ancestors(sense_id, 2)`` -> iterable of sense ids.

    Every prefix/suffix lookup is cached on the instance, both when it
    succeeds and when it finds nothing.
    """

    def __init__(self, saldo, min_seg=3, max_parts=4):
        """Create a splitter.

        saldo     -- lexicon object (see the class docstring)
        min_seg   -- minimum length of a segment considered by analyze()
                     and of a prefix accepted by import_prefixes()
        max_parts -- split() rejects words with this many (or more)
                     dash-separated parts
        """
        self.saldo = saldo
        self.min_seg = min_seg
        self.max_parts = max_parts
        # form -> lemgram ids of entries tagged 'c' (usable in any
        # non-final position), 'ci' (compound-initial only) and 'cm'
        # (non-initial only), respectively.
        self.prefix_c = defaultdict(set)
        self.prefix_ci = defaultdict(set)
        self.prefix_cm = defaultdict(set)
        # part-of-speech -> form -> lemgram ids usable as final segment.
        self.suffix = defaultdict(lambda: defaultdict(set))
        # Forms that have already been looked up in the lexicon.
        self.prefix_searched = set()
        self.suffix_searched = set()


    @staticmethod
    def combine(found, ends_at, j, history, depth):
        """Append to ``found`` every segment path that ends at position j.

        ``ends_at[k]`` is a list of ``(start, end, segment)`` tuples for
        segments ending at position k.  The paths are followed backwards
        from j; each segment is put in front of ``history`` and a path is
        complete once a segment starting at position 0 is reached.
        ``depth`` is the maximum number of segments that may still be
        added.  Nothing is returned; ``found`` is extended in place.
        """
        if depth <= 0:
            return
        for start, _, seg in ends_at[j]:
            path = [seg] + history
            if start == 0:
                found.append(path)
            else:
                Splitter.combine(found, ends_at, start, path, depth - 1)


    def _has_prefix(self, prefix, initial):
        """Return True if ``prefix`` can be a non-final compound segment.

        ``initial`` tells whether the segment is the first one of the
        compound: 'ci' entries only count when it is, 'cm' entries only
        when it is not, and 'c' entries always count.

        The lexicon is queried at most once per form.  The answer is always
        computed from the caches, so prefixes registered through
        import_prefixes() are honoured from the first call on and negative
        answers do not trigger a new lexicon query.
        """
        if prefix not in self.prefix_searched:
            rows = self.saldo.db_get_lemgrams_pos_msd_by_form(prefix)
            for lemgram_id, _, msd in rows:
                tags = msd.split()
                # An entry is filed under the first matching tag only.
                if 'c' in tags:
                    self.prefix_c[prefix].add(lemgram_id)
                elif 'ci' in tags:
                    self.prefix_ci[prefix].add(lemgram_id)
                elif 'cm' in tags:
                    self.prefix_cm[prefix].add(lemgram_id)
            self.prefix_searched.add(prefix)

        if prefix in self.prefix_c:
            return True
        if initial:
            return prefix in self.prefix_ci
        return prefix in self.prefix_cm


    # Lexicon part-of-speech tag -> part-of-speech tag used by the callers
    # of this class.  Lexicon entries with any other tag are ignored.
    POS_MAP = { 'nn': 'NN', 'vb': 'VB', 'av': 'JJ', 'ab': 'AB' }

    def _has_suffix(self, suffix, pos):
        """Return True if ``suffix`` is a lexicon entry with tag ``pos``.

        ``pos`` is compared with the POS_MAP translation of the lexicon
        tag.  The lexicon is queried at most once per form; later calls,
        for any ``pos``, are answered from the cache.
        """
        if suffix not in self.suffix_searched:
            rows = self.saldo.get_lemgrams_by_gf(suffix)
            for lemgram_id, _, saldo_pos, _ in rows:
                suc_pos = Splitter.POS_MAP.get(saldo_pos)
                if suc_pos is None:
                    continue
                self.suffix[suc_pos][suffix].add(lemgram_id)
            self.suffix_searched.add(suffix)

        # .get() avoids creating an empty cache entry for an unseen pos.
        by_form = self.suffix.get(pos)
        return by_form is not None and suffix in by_form


    def is_semantic_compound(self, word, segs, pos):
        """Check whether ``word`` shares a sense with its segments.

        Returns None when ``word`` itself is not a lexicon entry with tag
        ``pos``.  Otherwise returns True if the senses of ``word`` (plus
        their ancestors) overlap with the senses (plus ancestors) of the
        lemgrams cached for the segments in ``segs``, and False if not.

        Only the caches are consulted for the segments, so ``segs`` should
        come from an earlier analyze()/split() call on this instance.
        """
        if not self._has_suffix(word, pos):
            return None

        def get_ancestors(lemgram_ids):
            # Senses of the given lemgrams together with the result of
            # get_ancestors(sense, 2) for each of them (the 2 is presumably
            # a depth limit -- TODO confirm against the lexicon API).
            senses = set()
            for lemgram_id in lemgram_ids:
                # import_prefixes() registers prefixes with a None id.
                if lemgram_id is None:
                    continue
                senses |= {
                    sense[0] for sense in
                    self.saldo.get_senses_by_lemgram(lemgram_id)}
            ancestors = set()
            for sense_id in senses:
                ancestors |= set(self.saldo.get_ancestors(sense_id, 2))
            return ancestors | senses

        word_ancestors = get_ancestors(
            [lemgram[0] for lemgram in self.saldo.get_lemgrams_by_form(word)])

        # TODO: make this more efficient by checking as we go
        # First segment: 'c' and 'ci' entries.
        seg_ancestors = get_ancestors(
            self.prefix_c.get(segs[0], set()) |
            self.prefix_ci.get(segs[0], set()))
        # Middle segments: 'c' and 'cm' entries.
        for middle in segs[1:-1]:
            seg_ancestors |= get_ancestors(
                self.prefix_c.get(middle, set()) |
                self.prefix_cm.get(middle, set()))
        # Final segment: entries with the requested part of speech.
        seg_ancestors |= get_ancestors(
            self.suffix[pos].get(segs[-1], set()))

        return bool(seg_ancestors & word_ancestors)


    def analyze(self, word, pos, initial=True):
        """Return the possible segmentations of ``word``, or None.

        word    -- the string to segment
        pos     -- part-of-speech tag required of the final segment
        initial -- True if ``word`` starts a compound; when False its
                   first segment is treated as a middle segment

        All segments but the last must pass _has_prefix(), the last one
        must pass _has_suffix().  A segment whose predecessor ends in a
        doubled letter is also tried with that letter put in front of it
        (so the letter is shared by both segments).  Before that letter is
        added, every segment is at least ``min_seg`` characters long.

        The result is a list of segment lists; only segmentations with the
        smallest number of segments are included.  None is returned when
        there is no segmentation or when the smallest one has more than
        four segments.
        """
        end = len(word)
        # ends_at[j]: (start, end, segment) for every accepted segment that
        # ends at position j and can be reached from the start of the word.
        ends_at = [[] for _ in range(end + 1)]
        # len_at[j]: smallest number of segments covering word[:j].
        len_at = [None] * (end + 1)
        len_at[0] = 0

        def record(i, j, seg):
            ends_at[j].append((i, j, seg))
            count = 1 + len_at[i]
            if len_at[j] is None or count < len_at[j]:
                len_at[j] = count

        def candidates(i, j):
            # The plain segment first, then the variant that re-uses the
            # last letter of a predecessor ending in a doubled letter.
            yield word[i:j]
            # i >= 2 keeps word[i-2] from wrapping round to the end of the
            # word when min_seg is 1.
            if i >= 2 and word[i-2] == word[i-1]:
                yield word[i-1:j]

        # Try to find candidates for the initial and middle segments
        for j in range(self.min_seg, end - self.min_seg + 1):
            # First try the segment starting at position 0 (initial or
            # middle, depending on _initial_)
            seg = word[:j]
            if self._has_prefix(seg, initial):
                record(0, j, seg)
            # Then try segments starting at a reachable position i
            for i in range(self.min_seg, j - self.min_seg + 1):
                if not ends_at[i]:
                    continue
                for seg in candidates(i, j):
                    if self._has_prefix(seg, False):
                        record(i, j, seg)

        # Try to find candidates for the final segment
        for i in range(self.min_seg, end - self.min_seg + 1):
            if not ends_at[i]:
                continue
            for seg in candidates(i, end):
                if self._has_suffix(seg, pos):
                    record(i, end, seg)

        if not ends_at[end]:
            return None
        # NOTE(review): this limit is hard-coded and equals the default of
        # max_parts -- confirm whether self.max_parts was intended here.
        if len_at[end] > 4:
            return None
        found = []
        Splitter.combine(found, ends_at, end, [], len_at[end])
        return found


    RE_WORD = re.compile(
        r'([a-zA-ZåäöéüÅÄÖÉÜ0-9]+-)*[a-zA-ZåäöéüÅÄÖÉÜ]+(:[a-zA-Z]+)?$')
    RE_DASH = re.compile(r'-+')

    def split(self, word, pos):
        """Segment ``word``, treating dashes as given segment boundaries.

        Returns a list of segment lists, or None if the word is rejected
        or cannot be segmented.

        * Words with ``max_parts`` or more dash-separated parts, or with
          an empty part (leading/trailing dash), are rejected.
        * A word without dashes must match RE_WORD and is handed to
          analyze().
        * For a dashed word the dash-separated parts are returned as they
          are if the last part passes _has_suffix().  Otherwise the word
          must match RE_WORD and the last part is analysed as a
          non-initial word; if that yields nothing the parts are returned
          unchanged.
        """
        is_not_word = Splitter.RE_WORD.match(word) is None
        parts = Splitter.RE_DASH.split(word)
        if len(parts) >= self.max_parts:
            return None

        if len(parts) == 1:
            if is_not_word:
                return None
            return self.analyze(word, pos)

        # An empty part means a leading or trailing dash.  Such a word
        # never matches RE_WORD, so the result is None on every path
        # below; returning here just saves the lexicon lookup.
        if "" in parts:
            return None
        if self._has_suffix(parts[-1], pos):
            return [parts]
        if is_not_word:
            return None
        segs = self.analyze(parts[-1], pos, False)
        if segs is None:
            return [parts]
        return [parts[:-1] + seg for seg in segs]


    def import_prefixes(self, prefixes):
        """Register extra prefixes that are valid in any non-final position.

        Each item of ``prefixes`` is lower-cased; items shorter than
        ``min_seg`` are skipped.  The prefixes are stored with a None
        lemgram id, which is_semantic_compound() ignores.
        """
        for prefix in prefixes:
            if len(prefix) < self.min_seg:
                continue
            self.prefix_c[prefix.lower()].add(None)


if __name__ == '__main__':
    # Command-line use: <script> WORD POS
    # Prints the segmentations of WORD and, for each of them, the result
    # of the semantic check.
    import sys, sqlite3, os.path

    if len(sys.argv) < 3:
        sys.exit('usage: %s WORD POS' % sys.argv[0])

    # sys.argv entries are already text (str) on Python 3; str(value,
    # 'utf-8') on a str raises TypeError, so the argument is used as is.
    word = sys.argv[1]
    pos = sys.argv[2]
    # Explicit check rather than assert, which is stripped under -O.
    if pos not in ['NN','PM','VB','JJ','AB']:
        sys.exit('POS must be one of NN, PM, VB, JJ, AB (got %r)' % pos)

    saldo = SALDO()
    splitter = Splitter(saldo)
    splits = splitter.split(word, pos)
    print(splits)
    if splits is not None:
        for segs in splits:
            print(splitter.is_semantic_compound(word, segs, pos), segs)