def correct_arff(arff_file, labels):
    """Rewrite the declaration of the 'level' attribute in an arff file.

    Corrects the nominal category definition of the output label in an
    arff file created with the arff package. The file is read completely,
    every matching header line is replaced and the result is written back
    to the same path.

    A line is rewritten when it starts with "@" and its second
    space-separated token is exactly "level"; everything after that token
    is replaced by ``labels``. All other lines are kept unchanged.

    Args:
        arff_file: path of the arff file to correct in place
        labels: output labels as a string enclosed in curly brackets
            e.g. '{lab1, lab2, lab3}'
    """
    with open(arff_file) as f:
        arff_lines = f.readlines()

    corrected = []
    for line in arff_lines:
        tokens = line.split(" ")
        # Only header lines ("@...") can declare the label attribute; the
        # exact token comparison means a bare "@attribute level" line with
        # no type (second token "level\n") is deliberately left untouched.
        if line.startswith("@") and len(tokens) > 1 and tokens[1] == "level":
            line = tokens[0] + " " + tokens[1] + " " + labels + "\n"
        corrected.append(line)

    with open(arff_file, "w") as f:
        f.write("".join(corrected))
def sk_to_arff(data_file, target_file, arff_file, analysis_level,
               feature_n_file, num_label, arff_labels):
    """Transforms feature values from 'data_file' and 'target_file' to arff
    format usable in Weka using the arff package
    (http://code.google.com/p/arff/downloads/list) and writes the result
    to 'arff_file'.

    Each line of 'data_file' is parsed as space-separated floats and the
    label found on the same line number of 'target_file' is appended as the
    last column, named "level". The rows are shuffled with random.shuffle
    before being written.

    Args:
        data_file: file containing the extracted feature values
            for each instance
        target_file: file containing the output label for each instance
        arff_file: the name of the file where to save the arff
            formatted result
        analysis_level: 'sent' or 'text' depending on the level of the
            readability analysis (not used by this function's body)
        feature_n_file: a .txt file with the name of each feature,
            one per line
        num_label (bool): 'True' for numerical labels (usable for regression)
            'False' for categorical labels (A1,A2 etc., for classification)
        arff_labels (str): categorical labels to use in the .arff file

    Raises:
        ValueError: if a value in 'data_file' cannot be converted to float.
        IndexError: if 'target_file' has fewer lines than 'data_file'.
        KeyError: if num_label is true and a label is missing from
            cefr_scale.
    """
    with open(feature_n_file) as f:
        fn_str = f.read()
    # Blank lines (typically from a trailing newline) are skipped: they
    # would otherwise become empty attribute names and shift the header
    # relative to the data columns.
    feature_names = [name for name in fn_str.split("\n") if name.strip()]
    feature_names.append("level")  # the label is stored in the same file

    with open(data_file) as data_f:
        data = data_f.readlines()

    labels = []
    # The file is opened exactly once; the handle is closed by the
    # with-statement (no second, never-closed handle is created).
    with open(target_file) as target_f:
        for raw_label in target_f:
            lbl = raw_label.strip("\n")
            # With num_label the label is looked up in cefr_scale
            # (presumably its integer equivalent -- defined in
            # dset_proc_aux, confirm there).
            labels.append(cefr_scale[lbl] if num_label else lbl)

    new_data = []
    for i, line in enumerate(data):
        row = [float(fv) for fv in line.split(" ")]
        # Indexing (rather than zip) keeps the IndexError when there are
        # fewer labels than data lines.
        row.append(labels[i])
        new_data.append(row)

    random.shuffle(new_data)
    arff.dump(arff_file, new_data, relation='readability',
              names=feature_names)
    if not num_label:
        # The label column is rewritten afterwards with the nominal
        # category definition given in arff_labels.
        correct_arff(arff_file, arff_labels)