#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
   For reading paradigms from text representation and
   instantiating them with given words.

   File format, as consumed by the classes below: one paradigm per line,

       <pattern>#<pattern>#...<TAB><k>=<v>,<k>=<v>#<k>=<v>,...

   where every pattern is a '+'-separated sequence of variable names
   (digit strings) and literal strings, and every '#'-separated group
   after the tab is one set of variable bindings.  The value bound to
   '0' in the first group is used as the paradigm identifier.
"""

import sys
import re
import codecs
from collections import defaultdict
import os.path


class Paradigms:
    """A collection of Paradigm objects read from a file, keyed by id."""

    def __init__(self, filename):
        """Read one paradigm per non-blank line of the UTF-8 file `filename`.

        Populates:
          paradigms         -- dict mapping paradigm id to Paradigm
          pattern_freq      -- raw pattern string -> set of paradigm ids
                               containing that pattern
          paradigmfilename  -- base name of `filename`
        """
        self.paradigms = {}
        self.pattern_freq = defaultdict(set)
        self.paradigmfilename = os.path.basename(filename)
        # Progress message on stderr; written this way so that it works
        # under both Python 2 and Python 3.
        sys.stderr.write('open %s\n' % filename)
        with codecs.open(filename, encoding='utf-8') as f:
            for line in f:
                # Strip before testing: a blank line still carries its
                # newline, so its raw length is never zero.
                line = line.strip()
                if not line:
                    continue
                p = Paradigm(line)
                self.paradigms[p.id] = p
        for p in self.paradigms.values():
            for patt in p.patterns():
                self.pattern_freq[patt].add(p.id)

    def match_paradigms(self, w, only_baseform=False):
        """Return the match dicts of every paradigm that matches word `w`.

        See Paradigm.match_patterns for the shape of each match dict.
        """
        # Iterate over the Paradigm objects, not over the dict's id keys.
        return [c
                for p in self.paradigms.values()
                for c in p.match_patterns(w, only_baseform)]

    def paradigm_pattern_freq(self, pid, pids):
        """For each pattern of paradigm `pid`, count how many of the
        paradigm ids in the set `pids` contain the same raw pattern."""
        return [len(self.pattern_freq[patt].intersection(pids))
                for patt in self.paradigms[pid].patterns()]

    def get_table(self, paradigm_name, baseform, multiple=False):
        """Instantiate paradigm `paradigm_name` from `baseform`, matching
        against the paradigm's first pattern only.

        Returns a list of tables, one per distinct variable binding.
        Raises KeyError if `paradigm_name` is unknown.
        """
        paradigm = self.paradigms[paradigm_name]
        return paradigm.instantiate_using_wordform(baseform, True, multiple)


class Paradigm:
    """A list of Patterns together with the known variable bindings."""

    def __init__(self, s):
        """Parse one paradigm line `s` (patterns, a tab, then bindings).

        Raises ValueError if `s` does not contain exactly one tab.
        """
        (p, insts) = s.split('\t')
        self.paradigm = [Pattern(patt.strip()) for patt in p.split('#')]
        # Split on the first '=' only, so that a value may contain '='.
        self._instances = [dict(kv.split('=', 1) for kv in inst.split(','))
                           for inst in insts.split('#')]
        self.instances = [(self.instantiate_table(d), d)
                          for d in self._instances]
        # The paradigm is identified by the '0' value of its first instance.
        self.id = self.instances[0][1]['0']

    def instantiate_table(self, var_values):
        """Return every pattern instantiated with the dict `var_values`."""
        return [patt.instantiate(var_values) for patt in self.paradigm]

    def patterns(self):
        """Return the raw pattern strings of this paradigm."""
        return [patt.raw for patt in self.paradigm]

    def instantiate_using_wordform(self, w, only_baseform, multiple):
        """Match `w` and return one instantiated table per binding found."""
        return [self.instantiate_table(m['ms'])
                for m in self.match_patterns(w, only_baseform, multiple)]

    def match_patterns(self, w, only_baseform=False, multiple=False):
        """Match word `w` against the patterns and collect the distinct,
        non-empty variable bindings.

        Returns a list of dicts with the keys
          'w'     -- the word that was matched
          'table' -- the table instantiated with the binding
          'ms'    -- the binding itself (variable name -> string)
          'p'     -- this Paradigm
        """
        var_bindings = []
        to_match = self.paradigm
        if only_baseform:
            # Hax to only match first table
            to_match = [Pattern(self.paradigm[0].raw.split(',')[0])]
        for patt in to_match:
            for x in patt.match(w, multiple):
                var_binding = dict(x)
                if var_binding and var_binding not in var_bindings:
                    var_bindings.append(var_binding)
        return [{'w': w,
                 'table': self.instantiate_table(ms),
                 'ms': ms,
                 'p': self}
                for ms in var_bindings]

    def match_patterns_light(self, w, only_baseform=False, multiple=False):
        """Return True as soon as any pattern matches `w` with a non-empty
        binding, without building any tables; False otherwise."""
        to_match = [self.paradigm[0]] if only_baseform else self.paradigm
        for patt in to_match:
            for x in patt.match(w, multiple):
                if x:
                    return True
        return False

    def __eq__(self, other):
        """Two paradigms are equal when their ids are equal."""
        if not isinstance(other, Paradigm):
            return NotImplemented
        return self.id == other.id

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Kept consistent with __eq__; Python 3 would otherwise make the
        # class unhashable once __eq__ is defined.
        return hash(self.id)


class Pattern:
    """One '+'-separated pattern of variables (digit strings) and literals."""

    def __init__(self, pattern):
        """Store the raw pattern and compile its regular expression."""
        self.raw = pattern
        self.vars = []
        # * marks non-existing forms, needs to be escaped
        pattern = pattern.replace('*', '\\*')
        self.regex = self.__create_regex(pattern)

    def __create_regex(self, pattern):
        """Build an anchored regex from `pattern`: every all-digit component
        becomes a '(.+)' group (and is recorded in self.vars); any other
        component is inserted as-is with double quotes removed."""
        r = '^'
        for pc in pattern.split('+'):
            if pc.isdigit():
                self.vars.append(pc)
                r += r'(.+)'
            else:
                r += pc.replace('"', '')
        return re.compile(r + '$')

    def match(self, s, multiple=False):
        """Match string `s` against the pattern.

        Returns a list with one entry per regex match; each entry is a
        list of (variable, value) pairs.  `multiple` is accepted for
        interface compatibility but has no effect: the same matching
        function was always used whatever its value.
        """
        found = self.regex.findall(s)
        if len(self.vars) == 1:
            # With a single group findall yields plain strings, not tuples.
            found = [[x] for x in found]
        # list(...) so that callers can test emptiness/length on Python 3.
        return [list(zip(self.vars, x)) for x in found]

    def instantiate(self, var_values):
        """Return the raw pattern with every key of the dict `var_values`
        replaced by its value, and with '+' and '"' removed.

        All keys are substituted in a single pass over the raw pattern, so
        text inserted for one variable is never rewritten by another one
        (e.g. a value that happens to contain a digit).
        """
        new = self.raw
        # Longest keys first so that e.g. '10' is preferred over '1'.
        keys = sorted((k for k in var_values if k), key=len, reverse=True)
        if keys:
            substitute = re.compile('|'.join(re.escape(k) for k in keys))
            new = substitute.sub(lambda m: var_values[m.group(0)], new)
        return new.replace('+', '').replace('"', '')

    def strlen(self):
        """Return the length of the pattern with all variables empty."""
        d = dict((v, '') for v in self.vars)
        return len(self.instantiate(d))

    def embeddedness(self):
        """Map each variable to a pair of booleans: whether its first
        occurrence in the raw pattern is not at index 0, and whether it is
        not at the last index."""
        # internal affix?
        last = len(self.raw) - 1
        return dict((v, (self.raw.find(v) != 0, self.raw.find(v) != last))
                    for v in self.vars)

    def __eq__(self, other):
        """Two patterns are equal when their raw strings are equal."""
        if not isinstance(other, Pattern):
            return NotImplemented
        return self.raw == other.raw

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Kept consistent with __eq__.
        return hash(self.raw)

    def __str__(self):
        return self.raw