#! /usr/bin/env python # Last Change: Mon Aug 20 08:00 PM 2007 J from __future__ import division, print_function, absolute_import import re import itertools import datetime from functools import partial import numpy as np from scipy._lib.six import next """A module to read arff files.""" __all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError'] # An Arff file is basically two parts: # - header # - data # # A header has each of its components starting by @META where META is one of # the keyword (attribute of relation, for now). # TODO: # - both integer and reals are treated as numeric -> the integer info # is lost! # - Replace ValueError by ParseError or something # We know can handle the following: # - numeric and nominal attributes # - missing values for numeric attributes r_meta = re.compile('^\s*@') # Match a comment r_comment = re.compile(r'^%') # Match an empty line r_empty = re.compile(r'^\s+$') # Match a header line, that is a line which starts by @ + a word r_headerline = re.compile(r'^@\S*') r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]') r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)') r_attribute = re.compile(r'^@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)') # To get attributes name enclosed with '' r_comattrval = re.compile(r"'(..+)'\s+(..+$)") # To get normal attributes r_wcomattrval = re.compile(r"(\S+)\s+(..+$)") #------------------------- # Module defined exception #------------------------- class ArffError(IOError): pass class ParseArffError(ArffError): pass #------------------ # Various utilities #------------------ # An attribute is defined as @attribute name value def parse_type(attrtype): """Given an arff attribute value (meta data), returns its type. Expect the value to be a name.""" uattribute = attrtype.lower().strip() if uattribute[0] == '{': return 'nominal' elif uattribute[:len('real')] == 'real': return 'numeric' elif uattribute[:len('integer')] == 'integer': return 'numeric' elif uattribute[:len('numeric')] == 'numeric': return 'numeric' elif uattribute[:len('string')] == 'string': return 'string' elif uattribute[:len('relational')] == 'relational': return 'relational' elif uattribute[:len('date')] == 'date': return 'date' else: raise ParseArffError("unknown attribute %s" % uattribute) def get_nominal(attribute): """If attribute is nominal, returns a list of the values""" return attribute.split(',') def read_data_list(ofile): """Read each line of the iterable and put it in a list.""" data = [next(ofile)] if data[0].strip()[0] == '{': raise ValueError("This looks like a sparse ARFF: not supported yet") data.extend([i for i in ofile]) return data def get_ndata(ofile): """Read the whole file to get number of data attributes.""" data = [next(ofile)] loc = 1 if data[0].strip()[0] == '{': raise ValueError("This looks like a sparse ARFF: not supported yet") for i in ofile: loc += 1 return loc def maxnomlen(atrv): """Given a string containing a nominal type definition, returns the string len of the biggest component. A nominal type is defined as seomthing framed between brace ({}). Parameters ---------- atrv : str Nominal type definition Returns ------- slen : int length of longest component Examples -------- maxnomlen("{floup, bouga, fl, ratata}") returns 6 (the size of ratata, the longest nominal value). >>> maxnomlen("{floup, bouga, fl, ratata}") 6 """ nomtp = get_nom_val(atrv) return max(len(i) for i in nomtp) def get_nom_val(atrv): """Given a string containing a nominal type, returns a tuple of the possible values. A nominal type is defined as something framed between braces ({}). Parameters ---------- atrv : str Nominal type definition Returns ------- poss_vals : tuple possible values Examples -------- >>> get_nom_val("{floup, bouga, fl, ratata}") ('floup', 'bouga', 'fl', 'ratata') """ r_nominal = re.compile('{(.+)}') m = r_nominal.match(atrv) if m: return tuple(i.strip() for i in m.group(1).split(',')) else: raise ValueError("This does not look like a nominal string") def get_date_format(atrv): r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$") m = r_date.match(atrv) if m: pattern = m.group(1).strip() # convert time pattern from Java's SimpleDateFormat to C's format datetime_unit = None if "yyyy" in pattern: pattern = pattern.replace("yyyy", "%Y") datetime_unit = "Y" elif "yy": pattern = pattern.replace("yy", "%y") datetime_unit = "Y" if "MM" in pattern: pattern = pattern.replace("MM", "%m") datetime_unit = "M" if "dd" in pattern: pattern = pattern.replace("dd", "%d") datetime_unit = "D" if "HH" in pattern: pattern = pattern.replace("HH", "%H") datetime_unit = "h" if "mm" in pattern: pattern = pattern.replace("mm", "%M") datetime_unit = "m" if "ss" in pattern: pattern = pattern.replace("ss", "%S") datetime_unit = "s" if "z" in pattern or "Z" in pattern: raise ValueError("Date type attributes with time zone not " "supported, yet") if datetime_unit is None: raise ValueError("Invalid or unsupported date format") return pattern, datetime_unit else: raise ValueError("Invalid or no date format") def go_data(ofile): """Skip header. the first next() call of the returned iterator will be the @data line""" return itertools.dropwhile(lambda x: not r_datameta.match(x), ofile) #---------------- # Parsing header #---------------- def tokenize_attribute(iterable, attribute): """Parse a raw string in header (eg starts by @attribute). Given a raw string attribute, try to get the name and type of the attribute. Constraints: * The first line must start with @attribute (case insensitive, and space like characters before @attribute are allowed) * Works also if the attribute is spread on multilines. * Works if empty lines or comments are in between Parameters ---------- attribute : str the attribute string. Returns ------- name : str name of the attribute value : str value of the attribute next : str next line to be parsed Examples -------- If attribute is a string defined in python as r"floupi real", will return floupi as name, and real as value. >>> iterable = iter([0] * 10) # dummy iterator >>> tokenize_attribute(iterable, r"@attribute floupi real") ('floupi', 'real', 0) If attribute is r"'floupi 2' real", will return 'floupi 2' as name, and real as value. >>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real ") ('floupi 2', 'real', 0) """ sattr = attribute.strip() mattr = r_attribute.match(sattr) if mattr: # atrv is everything after @attribute atrv = mattr.group(1) if r_comattrval.match(atrv): name, type = tokenize_single_comma(atrv) next_item = next(iterable) elif r_wcomattrval.match(atrv): name, type = tokenize_single_wcomma(atrv) next_item = next(iterable) else: # Not sure we should support this, as it does not seem supported by # weka. raise ValueError("multi line not supported yet") #name, type, next_item = tokenize_multilines(iterable, atrv) else: raise ValueError("First line unparsable: %s" % sattr) if type == 'relational': raise ValueError("relational attributes not supported yet") return name, type, next_item def tokenize_single_comma(val): # XXX we match twice the same string (here and at the caller level). It is # stupid, but it is easier for now... m = r_comattrval.match(val) if m: try: name = m.group(1).strip() type = m.group(2).strip() except IndexError: raise ValueError("Error while tokenizing attribute") else: raise ValueError("Error while tokenizing single %s" % val) return name, type def tokenize_single_wcomma(val): # XXX we match twice the same string (here and at the caller level). It is # stupid, but it is easier for now... m = r_wcomattrval.match(val) if m: try: name = m.group(1).strip() type = m.group(2).strip() except IndexError: raise ValueError("Error while tokenizing attribute") else: raise ValueError("Error while tokenizing single %s" % val) return name, type def read_header(ofile): """Read the header of the iterable ofile.""" i = next(ofile) # Pass first comments while r_comment.match(i): i = next(ofile) # Header is everything up to DATA attribute ? relation = None attributes = [] while not r_datameta.match(i): m = r_headerline.match(i) if m: isattr = r_attribute.match(i) if isattr: name, type, i = tokenize_attribute(ofile, i) attributes.append((name, type)) else: isrel = r_relation.match(i) if isrel: relation = isrel.group(1) else: raise ValueError("Error parsing line %s" % i) i = next(ofile) else: i = next(ofile) return relation, attributes #-------------------- # Parsing actual data #-------------------- def safe_float(x): """given a string x, convert it to a float. If the stripped string is a ?, return a Nan (missing value). Parameters ---------- x : str string to convert Returns ------- f : float where float can be nan Examples -------- >>> safe_float('1') 1.0 >>> safe_float('1\\n') 1.0 >>> safe_float('?\\n') nan """ if '?' in x: return np.nan else: return float(x) def safe_nominal(value, pvalue): svalue = value.strip() if svalue in pvalue: return svalue elif svalue == '?': return svalue else: raise ValueError("%s value not in %s" % (str(svalue), str(pvalue))) def safe_date(value, date_format, datetime_unit): date_str = value.strip().strip("'").strip('"') if date_str == '?': return np.datetime64('NaT', datetime_unit) else: dt = datetime.datetime.strptime(date_str, date_format) return np.datetime64(dt).astype("datetime64[%s]" % datetime_unit) class MetaData(object): """Small container to keep useful informations on a ARFF dataset. Knows about attributes names and types. Examples -------- :: data, meta = loadarff('iris.arff') # This will print the attributes names of the iris.arff dataset for i in meta: print i # This works too meta.names() # Getting attribute type types = meta.types() Notes ----- Also maintains the list of attributes in order, i.e. doing for i in meta, where meta is an instance of MetaData, will return the different attribute names in the order they were defined. """ def __init__(self, rel, attr): self.name = rel # We need the dictionary to be ordered # XXX: may be better to implement an ordered dictionary self._attributes = {} self._attrnames = [] for name, value in attr: tp = parse_type(value) self._attrnames.append(name) if tp == 'nominal': self._attributes[name] = (tp, get_nom_val(value)) elif tp == 'date': self._attributes[name] = (tp, get_date_format(value)[0]) else: self._attributes[name] = (tp, None) def __repr__(self): msg = "" msg += "Dataset: %s\n" % self.name for i in self._attrnames: msg += "\t%s's type is %s" % (i, self._attributes[i][0]) if self._attributes[i][1]: msg += ", range is %s" % str(self._attributes[i][1]) msg += '\n' return msg def __iter__(self): return iter(self._attrnames) def __getitem__(self, key): return self._attributes[key] def names(self): """Return the list of attribute names.""" return self._attrnames def types(self): """Return the list of attribute types.""" attr_types = [self._attributes[name][0] for name in self._attrnames] return attr_types def loadarff(f): """ Read an arff file. The data is returned as a record array, which can be accessed much like a dictionary of numpy arrays. For example, if one of the attributes is called 'pressure', then its first 10 data points can be accessed from the ``data`` record array like so: ``data['pressure'][0:10]`` Parameters ---------- f : file-like or str File-like object to read from, or filename to open. Returns ------- data : record array The data of the arff file, accessible by attribute names. meta : `MetaData` Contains information about the arff file such as name and type of attributes, the relation (name of the dataset), etc... Raises ------ ParseArffError This is raised if the given file is not ARFF-formatted. NotImplementedError The ARFF file has an attribute which is not supported yet. Notes ----- This function should be able to read most arff files. Not implemented functionality include: * date type attributes * string type attributes It can read files with numeric and nominal attributes. It cannot read files with sparse data ({} in the file). However, this function can read files with missing data (? in the file), representing the data points as NaNs. Examples -------- >>> from scipy.io import arff >>> from cStringIO import StringIO >>> content = \"\"\" ... @relation foo ... @attribute width numeric ... @attribute height numeric ... @attribute color {red,green,blue,yellow,black} ... @data ... 5.0,3.25,blue ... 4.5,3.75,green ... 3.0,4.00,red ... \"\"\" >>> f = StringIO(content) >>> data, meta = arff.loadarff(f) >>> data array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')], dtype=[('width', '>> meta Dataset: foo \twidth's type is numeric \theight's type is numeric \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black') """ if hasattr(f, 'read'): ofile = f else: ofile = open(f, 'rt') try: return _loadarff(ofile) finally: if ofile is not f: # only close what we opened ofile.close() def _loadarff(ofile): # Parse the header file try: rel, attr = read_header(ofile) except ValueError as e: msg = "Error while parsing header, error was: " + str(e) raise ParseArffError(msg) # Check whether we have a string attribute (not supported yet) hasstr = False for name, value in attr: type = parse_type(value) if type == 'string': hasstr = True meta = MetaData(rel, attr) # XXX The following code is not great # Build the type descriptor descr and the list of convertors to convert # each attribute to the suitable type (which should match the one in # descr). # This can be used once we want to support integer as integer values and # not as numeric anymore (using masked arrays ?). acls2dtype = {'real': float, 'integer': float, 'numeric': float} acls2conv = {'real': safe_float, 'integer': safe_float, 'numeric': safe_float} descr = [] convertors = [] if not hasstr: for name, value in attr: type = parse_type(value) if type == 'date': date_format, datetime_unit = get_date_format(value) descr.append((name, "datetime64[%s]" % datetime_unit)) convertors.append(partial(safe_date, date_format=date_format, datetime_unit=datetime_unit)) elif type == 'nominal': n = maxnomlen(value) descr.append((name, 'S%d' % n)) pvalue = get_nom_val(value) convertors.append(partial(safe_nominal, pvalue=pvalue)) else: descr.append((name, acls2dtype[type])) convertors.append(safe_float) #dc.append(acls2conv[type]) #sdescr.append((name, acls2sdtype[type])) else: # How to support string efficiently ? Ideally, we should know the max # size of the string before allocating the numpy array. raise NotImplementedError("String attributes not supported yet, sorry") ni = len(convertors) def generator(row_iter, delim=','): # TODO: this is where we are spending times (~80%). I think things # could be made more efficiently: # - We could for example "compile" the function, because some values # do not change here. # - The function to convert a line to dtyped values could also be # generated on the fly from a string and be executed instead of # looping. # - The regex are overkill: for comments, checking that a line starts # by % should be enough and faster, and for empty lines, same thing # --> this does not seem to change anything. # 'compiling' the range since it does not change # Note, I have already tried zipping the converters and # row elements and got slightly worse performance. elems = list(range(ni)) for raw in row_iter: # We do not abstract skipping comments and empty lines for # performance reasons. if r_comment.match(raw) or r_empty.match(raw): continue row = raw.split(delim) yield tuple([convertors[i](row[i]) for i in elems]) a = generator(ofile) # No error should happen here: it is a bug otherwise data = np.fromiter(a, descr) return data, meta #----- # Misc #----- def basic_stats(data): nbfac = data.size * 1. / (data.size - 1) return np.nanmin(data), np.nanmax(data), np.mean(data), np.std(data) * nbfac def print_attribute(name, tp, data): type = tp[0] if type == 'numeric' or type == 'real' or type == 'integer': min, max, mean, std = basic_stats(data) print("%s,%s,%f,%f,%f,%f" % (name, type, min, max, mean, std)) else: msg = name + ",{" for i in range(len(tp[1])-1): msg += tp[1][i] + "," msg += tp[1][-1] msg += "}" print(msg) def test_weka(filename): data, meta = loadarff(filename) print(len(data.dtype)) print(data.size) for i in meta: print_attribute(i, meta[i], data[i]) # make sure nose does not find this as a test test_weka.__test__ = False if __name__ == '__main__': import sys filename = sys.argv[1] test_weka(filename)