#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
larka.py is a CGI interface for a number of different functionalities
available in Lärka.
http://spraakbanken.gu.se/larka/

Documentation:
https://spraakbanken.gu.se/eng/research/infrastructure/l%C3%A4rka/l%C3%A4rka-web-service (incomplete)
"""

import cgitb
import sys
import os
import cgi
import json
import codecs
import time
import re

import HitEx.ling_complexity
from HitEx.matching_set import MatchingSet
from HitEx.auxiliaries.hitex_defaults import default_parameters, default_criteria
import database

cgitb.enable()

################################################################################
# The version of this script
LARKA_VERSION = "1.1"
LARKA_VERSION_DATE = "2017-03-21"

# When started from one of the deployed HitEx directories, activate the
# virtualenv that lives under the larkalabb installation.
cwd = os.getcwd()
if cwd in ('/export/cgi-bin_sb/larkalabb/HitEx', '/export/cgi-bin_sb/larka/HitEx'):
    activate_this = os.path.join("/export/cgi-bin_sb/larkalabb",
                                 'venv/bin/activate_this.py')
    execfile(activate_this, dict(__file__=activate_this))

COMMANDS = ["complexity", "hitex", "log", "info",
            "cedit_save", "cedit_restore", "cedit_checkkey"]


def default_command(form):
    """Return the command to run when the request does not name one."""
    return "info"


# Regular expressions for parsing CGI parameters
IS_NUMBER = re.compile(r"^\d+$")
IS_PERCENT = re.compile(r"\d\d?$")
IS_IDENT = re.compile(r"^[\w\-,|]+$")
IS_CRITERIA = r"(filter|ranker|skip)"
IS_BOOL = r"(true|True|1)"


def _get_param(form, key, default=None):
    """Read 'key' from the request form, whatever its concrete type.

    main() hands the commands a plain dict for GET requests and a
    cgi.FieldStorage otherwise. FieldStorage has getvalue() but no get(),
    so every lookup goes through this helper to work for both.
    """
    try:
        return form.get(key, default)
    except AttributeError:
        return form.getvalue(key, default)


################################################################################
# Functions corresponding to the CGI commands

def main():
    """Main CGI handler.

    It reads the 'command' parameter and calls the same-named function
    with the CGI form as argument.

    Global CGI parameters
      - command: one of COMMANDS (default: "info", see default_command)
      - callback: an identifier that the result should be wrapped in
      - indent: pretty-print the result with a specific indentation
                (for debugging)

    Raises ValueError (via assert_key) when 'callback' or 'indent' is
    malformed.
    """
    starttime = time.time()
    print_header()

    # Convert form fields to regular dictionary. The query string is parsed
    # once; .get() avoids a KeyError when REQUEST_METHOD is not set.
    if os.environ.get('REQUEST_METHOD') == 'GET':
        fields = cgi.FieldStorage()
        form = dict((field, fields.getvalue(field)) for field in fields.keys())
    else:
        form = cgi.FieldStorage()

    command = _get_param(form, "command") or default_command(form)

    # Validate the output options before anything else: print_object() echoes
    # 'callback' into the response even when the command is rejected, so it
    # must never be emitted unchecked.
    assert_key("callback", form, IS_IDENT)
    assert_key("indent", form, IS_NUMBER)

    if command not in COMMANDS:
        msg = ("'%s' is not a permitted command, try these instead: '%s'"
               % (command, "', '".join(COMMANDS)))
        result = {"Error": msg, "code": 501}
    else:
        # Calling the requested CGI command
        result = globals()[command](form)

    # Adding time information
    total_time = str(round(time.time() - starttime, 2))
    if isinstance(result, list):
        for item in result:
            # Items without their own "time" entry get one created here.
            item.setdefault("time", {})["total_time"] = total_time
    elif command != "info":
        if "time" not in result:
            result["time"] = {}
        result["time"]["total_time"] = total_time

    print_object(result, form)


################################################################################
# Info
################################################################################

def info(form):
    """Return a dict listing the available commands."""
    # TO DO: provide info of functionalities?
    available = "', '".join(COMMANDS)
    return {"General information": "Available commands: '%s'." % available}


################################################################################
# TextEval
################################################################################

def complexity(form):
    """
    Linguistic complexity analysis for texts.
    Includes some simple statistics (e.g. LIX) and
    Language learning (CEFR) level prediction.

    Form parameters read here:
      - text: text to analyze (required)
      - produced_by: 'learner' or 'expert' (required)
      - ref_level: CEFR level of test or learner if known (default "B1")
      - CEFR_ML: integer passed on to the analysis (default 0)

    Returns the result of HitEx.ling_complexity.analyze_lg_complexity, or a
    dict with an "Error" entry when a parameter is missing or invalid.
    """
    text = _get_param(form, "text", "")
    produced_by = _get_param(form, "produced_by", "")

    if not text:
        return {"Error": "Missing value for 'text'"}
    if not produced_by:
        return {"Error": "Missing value for 'produced_by'"}
    if produced_by not in ["learner", "expert"]:
        return {"Error": "Wrong value for 'produced_by', choose 'learner' or 'expert'."}

    ref_level = _get_param(form, "ref_level", "B1")
    try:
        cefr_ml = int(_get_param(form, "CEFR_ML", "0"))
    except (TypeError, ValueError):
        # Report bad input like the other parameters instead of crashing.
        return {"Error": "Wrong value for 'CEFR_ML', it has to be an integer."}

    return HitEx.ling_complexity.analyze_lg_complexity(
        text, ref_level, produced_by, "single_text", cefr_ml)


################################################################################
# HitEx
################################################################################

def hitex(form):
    """Validate the HitEx request and run the sentence matching.

    Parameters listed in 'arg_types' are checked with assert_key and
    converted (comma-separated lists are split, numbers become int).
    Criteria are the keys of default_criteria; nested dicts there describe
    sub-criteria that are looked up in the form by their own name.

    If 'use_defaults' is set, absent parameters (except 'random_seed') and
    absent criteria are filled in from default_parameters/default_criteria.
    Otherwise every key in mandatory_params + non_default_mand must be
    present.

    Returns the object produced by MatchingSet.to_obj(), or a dict with
    "Error" and "code" 400 for the request errors detected here.
    Raises KeyError/ValueError (via assert_key) for malformed values.
    """
    parameters = {}
    criteria = {}

    # Argument types and their accepted values
    arg_types = {
        "query_w": r"(\w+|\[.*\])",
        "query_type": r"(lemma|wordform|cqp)",
        # POS tags form: https://spraakbanken.gu.se/korp/markup/msdtags.html
        "query_pos": r"(AB|DT|HA|HD|HP|HS|IE|IN|JJ|KN|NN|PC|PL|PM|PN|PP|PS|RG|RO|SN|UO|VB)",
        "corpus_list": IS_IDENT,
        "max_kwics": IS_NUMBER,
        "maxhit": IS_NUMBER,
        "target_edge": r"(start|end)",
        "proportion": IS_PERCENT,
        "target_cefr": r"(A|B|C)(1|2)",
        "voc_thr": IS_PERCENT,
        "min_len": IS_NUMBER,
        "max_len": IS_NUMBER,
        "non_alpha_thr": IS_PERCENT,
        "non_lemmatized_thr": IS_PERCENT,
        "sensitive_voc_cats": IS_IDENT,
        "preserve_bad": IS_BOOL,
        "random_seed": IS_NUMBER,
    }
    mandatory_params = ['query_w', 'query_type']
    non_default_mand = ['corpus_list', 'maxhit', 'max_kwics']
    sensitive_cats = ["sex", "violence", "other", "religion", "secretion", "all"]

    # Use defaults if specified
    use_defaults = bool(_get_param(form, "use_defaults"))
    if not use_defaults:
        mandatory_params += non_default_mand
        # Without defaults nothing can stand in for an absent mandatory
        # parameter, so reject the request up front.
        missing = [p for p in mandatory_params if p not in form]
        if missing:
            return {"Error": "Key is required: %s" % ", ".join(missing),
                    "code": 400}

    # Validate and interpret HitEx parameters
    for p, spec in arg_types.items():
        if p in form:
            assert_key(p, form, spec, p in mandatory_params)
            raw = _get_param(form, p)
            if spec == IS_IDENT:
                parameters[p] = raw.split(",")
                if p == "sensitive_voc_cats":
                    for cat in parameters[p]:
                        if cat not in sensitive_cats:
                            return {"Error": "The argument 'sensitive_voc_cats' "
                                             "has to be one or more of the "
                                             "following: '%s'"
                                             % "', '".join(sensitive_cats),
                                    "code": 400}
            elif spec in [IS_NUMBER, IS_PERCENT]:
                parameters[p] = int(raw)
            else:
                parameters[p] = raw
        elif spec == IS_BOOL:
            # An absent boolean flag means False.
            parameters[p] = False
        elif use_defaults and p != "random_seed":
            parameters[p] = default_parameters[p]

    # Validate and interpret HitEx criteria
    for cr, default_value in default_criteria.items():
        if isinstance(default_value, dict):
            for sub_cr in default_value:
                if sub_cr in form:
                    chosen = _get_param(form, sub_cr)
                    # "" same effect as "skip"
                    if chosen != "skip":
                        assert_key(sub_cr, form, IS_CRITERIA)
                        criteria.setdefault(cr, {})[sub_cr] = chosen
                elif use_defaults:
                    criteria.setdefault(cr, {})[sub_cr] = default_value[sub_cr]
        elif cr in form:
            if cr == "readability" and "target_cefr" not in form:
                return {"Error": "To use 'readability' as a criteria, specify "
                                 "'target_cefr' (A1, A2, B1, B2 or C1)",
                        "code": 400}
            assert_key(cr, form, IS_CRITERIA)
            criteria[cr] = _get_param(form, cr)
        elif use_defaults:
            criteria[cr] = default_value

    # Analyze sentences for match
    ms = MatchingSet(parameters, criteria)
    ms.create_set()
    return ms.to_obj()


################################################################################
# Log
################################################################################

def log(form):
    """Append the 'text' parameter, preceded by a timestamp, to a log file.

    TO DO: change to database logging

    'log_type' selects the file logs/<log_type>.txt (relative to the
    current working directory) and must be 'exe_linguists' or 'liwrix'.
    Returns a dict with "code" 202 on success and 400 otherwise.
    """
    timestamp = "\n" + time.strftime("%H:%M:%S") + "\n"
    text_to_log = timestamp + _get_param(form, "text", "")  # .decode("utf-8")
    log_type = _get_param(form, "log_type", "")

    # NOTE(review): an empty 'text' still logs the bare timestamp, exactly as
    # before; confirm whether that is wanted.
    if log_type not in ["exe_linguists", "liwrix"]:
        return {"Error": "Wrong value for 'log_type'. Available options: exe_linguists, liwrix",
                "code": 400}

    log_file = "logs/%s.txt" % log_type
    with codecs.open(log_file, "a") as f:
        # Opening in append mode creates the file if needed, so the chmod no
        # longer fails on the very first entry of a new log.
        os.chmod(log_file, 0o664)
        f.write(str(text_to_log))
    return {"result": "Result logged.", "code": 202}


################################################################################
# Cedit save/restore
################################################################################

def cedit_save(form):
    """Pass 'userkey', 'lastposition' and 'content' to database.cedit_save."""
    userkey = _get_param(form, "userkey", "")
    lastposition = _get_param(form, "lastposition", "")
    content = _get_param(form, "content", "")
    return database.cedit_save(userkey, lastposition, content)


def cedit_restore(form):
    """Pass 'userkey' to database.cedit_restore and return its result."""
    return database.cedit_restore(_get_param(form, "userkey", ""))


def cedit_checkkey(form):
    """Pass 'userkey' to database.cedit_checkkey and return its result."""
    return database.cedit_checkkey(_get_param(form, "userkey", ""))


################################################################################
# Helper functions - based on korp.cgi (http://spraakbanken.gu.se/korp/)

def assert_key(key, form, regexp, required=False):
    """Check that the value of the attribute 'key' in the CGI form
    matches the specification 'regexp'. If 'required' is True, then
    the key has to be in the form.

    'regexp' may be a pattern string or a compiled pattern; values are
    tested with re.match, i.e. anchored at the start only.

    Raises KeyError if a required key is missing or empty, and ValueError
    if any supplied value does not match.
    """
    value = _get_param(form, key, "")
    if not value:
        values = []
    elif isinstance(value, list):
        values = value
    else:
        values = [value]

    if required and not values:
        raise KeyError("Key is required: %s" % key)

    if not all(re.match(regexp, x) for x in values):
        pattern = regexp.pattern if hasattr(regexp, "pattern") else regexp
        raise ValueError("Value(s) for key %s do(es) not match /%s/: %s"
                         % (key, pattern, values))


def print_header():
    """Prints the JSON header."""
    sys.stdout.write(
        "Content-Type: application/json\n"
        "Access-Control-Allow-Origin: *\n"
        "Access-Control-Allow-Methods: POST, GET, OPTIONS\n"
        "Access-Control-Allow-Headers: Content-Type\n"
        "\n")


def print_object(obj, form):
    """Prints an object in JSON format.
    The CGI form can contain optional parameters 'callback' and 'indent'
    which change the output format.

    With a usable 'indent' the object is pretty-printed with sorted keys,
    otherwise it is written in the most compact form.
    """
    callback = _get_param(form, "callback")

    # Only the conversion is guarded: a missing or non-numeric 'indent'
    # simply selects the compact form.
    try:
        indent = int(_get_param(form, "indent"))
    except (TypeError, ValueError):
        indent = None

    if indent is None:
        payload = json.dumps(obj, separators=(",", ":")) + "\n"
    else:
        payload = json.dumps(obj, sort_keys=True, indent=indent)

    if callback:
        # The spacing mirrors what the former print statements emitted, so
        # the response bytes stay the same.
        closing = ")" if payload.endswith("\n") else " )"
        payload = callback + "( " + payload + closing

    sys.stdout.write(payload + "\n")


# TO DO? prevent_timeout() - see korp.cgi

if __name__ == "__main__":
    main()