# -*- coding: utf-8 -*-
"""Convert a text collection into a binary sequence of integer word ids.

Usage: python <script> <in_file> <out_file>

The input is read twice: once to collect the vocabulary and once to write the
id of every token to the output file as native unsigned ints (array type 'I').
"""
import sys
import re
from array import array


def usage():
    """Print the command-line synopsis to standard output."""
    # NOTE(review): the argument placeholders are reconstructed from the
    # variable names used in the main block below.
    print("Usage: python " + sys.argv[0] + " <in_file> <out_file>")


# Token separators: whitespace runs, quotes, and basic punctuation.
space = re.compile(r'\s+|\"|\'|\.|,|“|”|;|:')

# NOTE(review): the exact header pattern could not be recovered from the
# available copy of this file; only its leading '<' is known.  Treating every
# line that starts with '<' as a document header is an assumption -- confirm
# against the corpus format before relying on it.
_header = re.compile(r"<")


def _tokens(line):
    """Return the non-empty tokens of *line*, or [] for a header line."""
    # Omit the header lines of each document.
    if _header.match(line):
        return []
    # re.split yields empty strings between adjacent separators; drop them.
    return [w for w in space.split(line) if w]


def create_dict(file_name):
    """Build the vocabulary of *file_name*.

    Returns a dict mapping each distinct token to an id in 1..len(dict).
    Ids are assigned in sorted token order so that the mapping is the same
    on every run, independent of set iteration order.
    """
    set_words = set()
    with open(file_name) as fp:
        for line in fp:
            set_words.update(_tokens(line))
    return {w: i for i, w in enumerate(sorted(set_words), 1)}


def word_to_int(file_name, dict_words, out_file):
    """Write the id of every token of *file_name* to *out_file*.

    Ids are looked up in *dict_words* and written in reading order as
    array('I') items (native unsigned int size and byte order).

    Returns the list [n, sigma], where n is the number of ids written and
    sigma is len(dict_words).  Raises KeyError if a token is not present in
    *dict_words*.
    """
    n = 0
    # Binary mode: array.tofile writes raw machine values, which text mode
    # would reject (Python 3) or could alter through newline translation.
    with open(file_name) as fp, open(out_file, "wb") as fp_out:
        for line in fp:
            int_ids = array('I', [dict_words[w] for w in _tokens(line)])
            int_ids.tofile(fp_out)
            n += len(int_ids)
    return [n, len(dict_words)]


if __name__ == "__main__":
    if len(sys.argv) < 3:
        usage()
        sys.exit(1)

    in_file = sys.argv[1]
    out_file = sys.argv[2]

    dict_words = create_dict(in_file)
    n, sigma = word_to_int(in_file, dict_words, out_file)

    print("n: " + str(n))
    print("sigma: " + str(sigma))