#!/usr/bin/python
# -*- coding: utf-8 -*-
"""CGI front end that answers 'hitex' requests with a JSON document.

The script reads the CGI form, validates the request, builds a
``MatchingSet`` from the resulting parameters and criteria, and prints the
set's JSON serialization (optionally wrapped in a JSONP callback).
"""

# cgitb is enabled before the remaining imports so that a failing import is
# reported by cgitb as well.
import cgitb
cgitb.enable()

import cgi
import copy
import json
import os
import re
import sys

from HitEx.matching_set import MatchingSet

#activate_this = os.path.join(os.path.dirname(__file__), 'venv/bin/activate_this.py')
#execfile(activate_this, dict(__file__=activate_this))

# CEFR levels accepted for the 'target_cefr' argument.
VALID_CEFR_LEVELS = ("A1", "A2", "B1", "B2", "C1")

# Values of 'use_defaults' that switch the default configuration on.
USE_DEFAULTS_TRUE = ("true", True, 1, "1")

# A JSONP callback must be a (possibly dotted) identifier; anything else is
# ignored so that arbitrary request text is never echoed as script code.
_CALLBACK_RE = re.compile(
    r"^[A-Za-z_$][A-Za-z0-9_$]*(?:\.[A-Za-z_$][A-Za-z0-9_$]*)*\Z")

# Parameters used when the request sets 'use_defaults'. The keys of this
# table are also the parameter names accepted from the form otherwise.
DEFAULT_PARAMETERS = {
    #"query_type": "lemma",   # wordform or cqp or lemma
    #"query_w": u'bröd',      # u"huset", [deprel = "SS" & lemma contains "språk"].decode("utf-8")
    #"query_pos": "",         # "NN",
    #"corpus_list": ["ROM99", "GP2012", "LASBART"],  # randomly pick one of them? "BLOGGMIX2012"
    "corpus_list": ["rom99", "flashback-resor", "gp2013", "gp2d",
                    "attasidor", "lasbart", "suc3", "talbanken"],
    # other candidates: "wikipedia-sv", "sweachum", "sweacsam"
    #"corpus_list": ["attasidor", "lasbart", "talbanken", "rom99",
    #                "familjeliv-allmanna-fritid", "flashback-mat",
    #                "flashback-resor"],
    "max_kwics": 100,           # 300; nr KWICs to process (limited for efficiency reasons)
    "maxhit": 10,               # maximum number of matches to return
    "target_edge": "end",       # to which edge the keyword should be close to
    "proportion": 50,           # within which percentage of the sent the keyword should appear
    "target_cefr": "B1",        # 'any' not supported - omit readability to obtain the same effect
    "voc_thr": 0,               # percentage of words above the target CEFR level
    "min_len": 6,
    "max_len": 20,
    "non_alpha_thr": 30,
    "non_lemmatized_thr": 30,
    "lex_to_func_thr": "",      # 0.8,
    "sensitive_voc_cats": ["all"],  # ["sex", "violence", "other", "religion", "secretion"],
    "preserve_bad": True,
}

# Criteria used when the request sets 'use_defaults'.
# TO DO: change to {"filters": ["readability"], "rankers": []}
DEFAULT_CRITERIA = {
    "well_formedness": {
        "root": "filter",
        "sent_tokenization": "filter",
        "elliptic": "filter",
        "non_alpha": "filter",
        "non_lemmatized": "filter",
    },
    "isolability": {
        "struct_conn": "filter",
        "yn_answer": "filter",
        "anaphora-PN": "filter",
        "anaphora-AB": "filter",
    },
    "readability": "filter",
    "typicality": "ranker",
    "sensitive_voc": "filter",
    "other_criteria": {
        "length": "filter",
        "proper_name": "ranker",
        "repkw": "filter",
        "kw_position": "",
        "modal_verb": "",
        "participle": "",       # same as korp
        "sverb": "",            # same as korp
        "interrogative": "filter",
        "neg_form": "",
        "abbrev": "filter",
        "direct_speech": "filter",
        "diff_voc_kelly": "filter",
        "svalex_fr": "",
        "out_of_svalex": "filter",
    },
}


class _RequestError(Exception):
    """Raised for an invalid request; carries the JSON error payload."""

    def __init__(self, message, code=400):
        Exception.__init__(self, message)
        self.payload = {"Error": message, "code": code}


def create_mset_json(parameters, criteria):
    """Build a matching set and return it with its JSON form and URL.

    Args:
        parameters: dictionary of search parameters passed to MatchingSet.
        criteria: dictionary of selection criteria passed to MatchingSet.

    Returns:
        A tuple ``(matching_set, json_text, url)`` where ``json_text`` is the
        value of ``MatchingSet.to_json()`` and ``url`` the value of
        ``MatchingSet.get_url()``.
    """
    ms = MatchingSet(parameters, criteria)
    ms.create_set()
    url = ms.get_url()
    j = ms.to_json()
    return (ms, j, url)


def _read_form():
    """Return the CGI form fields as a regular dictionary.

    FieldStorage is instantiated exactly once: it consumes the request body
    while parsing, so a second instance would not see POSTed fields.
    """
    storage = cgi.FieldStorage()
    return dict((field, storage.getvalue(field)) for field in storage.keys())


def _build_request(form):
    """Validate a 'hitex' form and derive the MatchingSet arguments.

    Args:
        form: dictionary of form fields as produced by _read_form().

    Returns:
        A tuple ``(parameters, criteria)``.

    Raises:
        _RequestError: if a required argument is missing or malformed.
    """
    parameters = {}
    criteria = {}

    if "query_type" not in form:
        raise _RequestError(
            "Missing 'query_type', choose what TYPE of "
            "term to search for (lemma, wordform or cqp).")
    parameters["query_type"] = form["query_type"]

    if "query_w" not in form:
        raise _RequestError(
            "Missing 'query_w', choose a what term to search for.")
    parameters["query_w"] = form["query_w"]

    #if "query_pos" in form:  # when not specified, any POS will be matched
    #    parameters["query_pos"] = form["query_pos"]

    if "target_cefr" in form:
        if form["target_cefr"] not in VALID_CEFR_LEVELS:
            raise _RequestError(
                "Argument 'target_cefr' has to be a CEFR "
                "level (A1, A2, B1, B2 or C1)")
    elif "readability" in form:
        raise _RequestError(
            "To use 'readability' as a criteria, specify "
            "'target_cefr' (A1, A2, B1, B2 or C1)")

    if "random_seed" in form:
        try:
            parameters["random_seed"] = int(form["random_seed"])
        except (TypeError, ValueError):
            # TypeError covers a repeated field, which arrives as a list.
            raise _RequestError(
                "The argument 'random_seed' has to be a integer value")

    if form.get("use_defaults") in USE_DEFAULTS_TRUE:
        # Deep copies keep the module-level tables pristine even if the
        # matching set modifies what it is given.
        parameters.update(copy.deepcopy(DEFAULT_PARAMETERS))
        criteria.update(copy.deepcopy(DEFAULT_CRITERIA))
        return parameters, criteria

    if "use_defaults" in form:
        raise _RequestError(
            "'use_defaults' should be set to 'true' for "
            "using default parameters and criteria for the search.")

    # NOTE(review): the list of mandatory parameters is empty, so the
    # "mandatory" error below can currently never be reported although its
    # message names three arguments - confirm which behavior is intended.
    mandatory_params = []
    for name in DEFAULT_PARAMETERS:
        if name in form:
            parameters[name] = form[name]
        elif name in mandatory_params:
            raise _RequestError(
                "The arguments 'corpus_list', 'maxhit' "
                "and 'max_kwics' are mandatory. Set 'use_defaults' to 'true' "
                "if you don't want to specify these.")

    # NOTE(review): criteria supplied in the form are stored in 'parameters'
    # and 'criteria' stays empty on this path. Kept as is because
    # MatchingSet is not visible here - verify against MatchingSet.
    for name, setting in DEFAULT_CRITERIA.items():
        if isinstance(setting, dict):
            for sub_name in setting:
                if sub_name in form:
                    parameters[sub_name] = form[sub_name]
        elif name in form:
            parameters[name] = form[name]

    return parameters, criteria


def main():
    """Main CGI handler."""
    # The header is printed before validation and contains no Status line;
    # errors are reported through the "code" member of the JSON body.
    print_header()

    # Convert form fields to regular dictionary
    form = _read_form()
    #form = {"command":"hitex", "query_type":"lemma", "query_w":"hund", "use_defaults":"1"} # for testing

    if form.get("command", "") == "hitex":
        try:
            parameters, criteria = _build_request(form)
        except _RequestError as err:
            result = err.payload
        else:
            # Analyze sentences for match
            _mset, json_ms, _url = create_mset_json(parameters, criteria)
            result = json.loads(json_ms)
    else:
        result = {"Error": "Unknown or missing command. Command "
                           "option(s): hitex, complexity, log",
                  "code": 404}

    print_object(result, form)


def print_header():
    """Prints the JSON header."""
    header_lines = (
        "Content-Type: application/json",
        "Access-Control-Allow-Origin: *",
        "Access-Control-Allow-Methods: POST, GET, OPTIONS",
        "Access-Control-Allow-Headers: Content-Type",
    )
    # The trailing empty line separates the header from the body.
    sys.stdout.write("\n".join(header_lines) + "\n\n")


def print_object(obj, form):
    """Prints an object in JSON format.

    The CGI form can contain optional parameters 'callback' and 'indent'
    which change the output format: with a usable integer 'indent' the
    output is indented with sorted keys, otherwise it is compact; with a
    valid 'callback' the JSON is wrapped as ``callback(...)``.
    """
    try:
        indent = int(form.get("indent"))
    except (TypeError, ValueError):
        # 'indent' is absent or not an integer: use the compact form.
        payload = json.dumps(obj, separators=(",", ":"))
    else:
        payload = json.dumps(obj, sort_keys=True, indent=indent)

    callback = form.get("callback")
    if isinstance(callback, str) and _CALLBACK_RE.match(callback):
        payload = callback + "(" + payload + ")"

    sys.stdout.write(payload + "\n")


if __name__ == "__main__":
    main()