# -*- coding: utf-8 -*-

import sys
import re
from array import array

def usage():
    """Print the command-line synopsis of this script to standard output."""
    # The pieces are joined into a single string so that the call prints
    # the same text whether `print` is a statement (Python 2) or a
    # function (Python 3).
    print(" ".join(("Usage: python", sys.argv[0], "<input> <output>")))

# Token separator pattern: a run of whitespace, or a single straight quote
# (" or '), curly double quote (“ or ”), period, comma, semicolon or colon.
# Only whitespace is matched as a run, so splitting on adjacent separators
# yields empty strings that callers have to discard.
space = re.compile(r'\s+|\"|\'|\.|,|“|”|;|:')	

def create_dict(file_name):
    """Build the vocabulary of a text file and give every word an integer id.

    The file is read line by line. Lines starting with ``<doc`` (the header
    of each document) are skipped; every other line is stripped and split
    with the module-level ``space`` pattern, and each non-empty token is
    collected as a word.

    Args:
        file_name: Path of the text file to scan.

    Returns:
        A dict mapping each distinct word to a unique id in
        ``1..len(vocabulary)``. Ids follow the sorted order of the words, so
        the mapping is the same on every run for the same input.
    """
    set_words = set()

    with open(file_name) as fp:
        for line in fp:
            # Omit the header of each document.
            if line.startswith("<doc"):
                continue

            # Splitting can produce empty strings between adjacent
            # separators; those are not words.
            set_words.update(w for w in space.split(line.strip()) if w)

    # Iterating the set directly would tie the ids to its internal ordering,
    # which can change between interpreter runs when string hashing is
    # randomized. Sorting makes the assignment reproducible.
    return {w: i for i, w in enumerate(sorted(set_words), 1)}

def word_to_int(file_name, dict_words, out_file):
    """Encode the words of a text file as integer ids in a binary file.

    The input is tokenized exactly as in ``create_dict``: lines starting
    with ``<doc`` are skipped, the remaining lines are stripped and split
    with the module-level ``space`` pattern, and empty tokens are dropped.
    The ids of each line's words are written to ``out_file`` as a flat
    sequence of unsigned ints (``array`` type code ``'I'``) in the machine's
    native representation, with no separators between lines.

    Args:
        file_name: Path of the text file to encode.
        dict_words: Mapping from word to integer id; it must contain every
            word that occurs in the file.
        out_file: Path of the binary file to create or overwrite.

    Returns:
        A two-element list ``[n, sigma]`` where ``n`` is the number of ids
        written and ``sigma`` is ``len(dict_words)``.

    Raises:
        KeyError: If the file contains a word missing from ``dict_words``.
    """
    n = 0

    # The output must be opened in binary mode: array.tofile() writes raw
    # bytes, which a text-mode file either rejects or may alter through
    # newline translation. The `with` block closes both files even when an
    # error interrupts the encoding.
    with open(file_name) as fp, open(out_file, "wb") as fp_out:
        for line in fp:
            # Omit the header of each document.
            if line.startswith("<doc"):
                continue

            ids = [dict_words[w] for w in space.split(line.strip()) if w]
            int_ids = array('I', ids)
            int_ids.tofile(fp_out)
            n += len(int_ids)

    return [n, len(dict_words)]
    
if __name__ == "__main__":
    # Script entry point: expects the input text file and the output binary
    # file as the two command-line arguments.
    if len(sys.argv) < 3:
        usage()
        # sys.exit is used rather than the bare `exit` helper, which is
        # only installed by the `site` module.
        sys.exit(1)

    in_file = sys.argv[1]
    out_file = sys.argv[2]

    # First pass collects the vocabulary, second pass writes the id sequence.
    dict_words = create_dict(in_file)
    n, sigma = word_to_int(in_file, dict_words, out_file)

    # n is the number of ids written; sigma is the vocabulary size.
    print("n: " + str(n))
    print("sigma: " + str(sigma))
        
