import re
import codecs
from dset_proc_aux import *

def is_keyword_within_position(kw, target_edge, proportion):
    #Get target_edge (start/end) and proportion (0.1-0.9) from UI of experiment with parameters
    #eg. "start" 0.2 -> keyword in first 20% of the sentence 
    try:
        position = float(kw.match.start) / kw.sentence.length
    except AttributeError:
         return True
    if target_edge == "start":
        if position <= proportion:
            return True
        else:
            return False
    elif target_edge == "end":
        if position >= 1-proportion:
            return True
        else:
            return False
    else:
        print "Invalid target-edge value"    

def out_of_length_range(sent_len, min_len, max_len):
    #Exclude some frequent puctuation marks from the word count -> maybe not necessary
    #cleaned_sent = re.split('[,.\s]', sent) #improve pattern?
    #words = [w for w in cleaned_sent if len(w) > 0] # char level!
    if min_len <= sent_len <= max_len:
        return False
    else:
        return True

def add_keyword_info(t, stats, params):
    found_kw = False
    if params["query_type"] == "cqp":
        if params["query_w"].split('"')[-2] in t.lemma:
            found_kw = True
    elif params["query_type"] == "wordform" and params["query_w"] == t.word:
            found_kw = True
    elif params["query_type"] == "lemma" and params["query_w"] in t.lemma:
        found_kw = True
    if found_kw:
        stats["keyword"]["word"] = t.word
        stats["keyword"]["lemma"] = t.lemma
        stats["keyword"]["msd"] = t.msd
        stats["keyword"]["pos"] = t.pos
        stats["keyword"]["lex"] = t.lex 
        stats["keyword"]["saldo"] = t.saldo
        stats["keyword"]["ref"] = t.ref
        stats["keyword"]["deprel"] = t.ref 

def split_keyword_context(sent_match):
    try:
        ref = int(sent_match.stats["keyword"]["ref"])
    except KeyError: #AttributeError
        ref = 1 #int(sent_match.stats["keyword"]["ref"])
    sent_match.sent_left = " ".join(sent_match.sent.words.split(" ")[:ref-1]) #sent_match.sent.words.split(" ")[ref:]
    sent_match.sent_right = " ".join(sent_match.sent.words.split(" ")[ref:])
        

def load_list(path_to_list, delimiter="\t"):
    """
    Loads a list from a file and returns a nested list of its lines, the items
    of each line being split along the specified delimiter with the new line
    character removed from the end of each line.
    Args:
        path_to_list: path of the file to open 
        delimiter:  the character used to separate the items on a line
    """
    with codecs.open(path_to_list) as f:
        opened = f.readlines()
    item_list =[]
    for line in opened:
        if line[0] != "#":
            l_lst = line.split(delimiter)
            item_list.append([el.strip("\n") for el in l_lst])
    return item_list