1
2 from FormattedFileProcessor import FormattedFileProcessor
3 import sets
4
6 """
7 Read/process TSV (tab-separated) formatted files
8 """
12
def process(self, out_method, fields_to_include=None):
    """
    Read and process the TSV file named by self.input_file_name line by line.

    The first line of the file is the header; its column names are
    lower-cased before being matched against fields_to_include (which is
    lower-cased as well, so matching is case-insensitive).

    out_method: callable invoked once per data row with the selected values
        joined by tabs and terminated by a newline. If None, rows are
        collected in a dictionary and returned instead.
    fields_to_include: column names to keep, in the order they should appear
        in the output. The first one is used as the dictionary key when
        out_method is None. If None, every column of the row is kept and the
        first header column is the key.

    Returns None when out_method is given. Otherwise returns a tuple
    (columns, id_to_value): columns maps each lower-cased column name to its
    index inside the stored row lists, and id_to_value maps the key column's
    value to the list of values kept for that row (a later row with the same
    key overwrites an earlier one).

    Raises KeyError when a data row is processed and a requested field is not
    in the header, and IndexError when a row has fewer fields than required.
    """
    id_to_value = {}
    # 'with' guarantees the file is closed even if a row or out_method raises.
    with open(self.input_file_name) as handle:
        header = handle.readline()
        cols = [c.lower() for c in header.strip().split('\t')]
        if fields_to_include is None:
            first_column = cols[0]
        else:
            fields_to_include = [f.lower() for f in fields_to_include]
            first_column = fields_to_include[0]
        # Header name -> position in the file (a duplicated name keeps its
        # last position).
        columns = dict(zip(cols, range(len(cols))))
        for line in handle:
            # NOTE: strip() also removes leading/trailing tabs, so empty
            # fields at either edge of a row are dropped before splitting.
            vals = line.strip().split('\t')
            if fields_to_include is None:
                # No selection requested: keep the whole row as read.
                selected = vals
            else:
                selected = [vals[columns[f]] for f in fields_to_include]
            if out_method is None:
                id_to_value[vals[columns[first_column]]] = selected
            else:
                out_method("%s\n" % "\t".join(selected))
    if out_method is not None:
        return None
    if fields_to_include is not None:
        # Stored rows follow the order of fields_to_include, so the returned
        # index map must follow that order too (not the file's order).
        columns = dict(zip(fields_to_include, range(len(fields_to_include))))
    return columns, id_to_value
55