Utilisateur:Jona/process occ.py
Définition, traduction, prononciation, anagramme et synonyme sur le dictionnaire libre Wiktionnaire.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Count languages and word types found in a wiki article dump.

Creates two pickle files (``dictOcc`` and ``listOcc``) in the current
directory; they are meant to be consumed by the ``generate_table`` script.

The code is written so that it runs unchanged on Python 2.7 and Python 3.
"""
import getopt
import os
import pickle
import re
import shutil
import sys

# TODO: add an interactive mode
# TODO: make a more formal usage output and options manager

# Module-level defaults so that the helpers below can be imported and called
# without going through main() (which overrides these from the command line).
_verbose = 0
_debug = 0

try:
    _read_input = raw_input  # Python 2
except NameError:
    _read_input = input  # Python 3

# Patterns are compiled once instead of once per input line.
# Language header, e.g. {{=fr=}}: shortest text between "{{=" and "=}}".
_LANG_RE = re.compile(r'\{\{=(.*?)=\}\}')
# Word-type header, e.g. {{-nom-|fr}}: group 1 is the type name.
_TYPE_RE = re.compile(r'\{\{-(\S+)-(\|\S+){0,2}\}\}')
# Stub marker, e.g. {{ébauche|fr}}.  Can be improved.
_STUB_RE = re.compile(r'\{\{ébauche(\|\S{0,3})?\}\}')

# Type names recorded exactly as matched (the whole regex tuple is kept).
_KEPT_TYPES = frozenset(["nom", "adj", "nom-pr", "pron", "verb", "adv"])
# Long type names folded onto their short equivalent.
_TYPE_ALIASES = {
    "adjectif": "adj",
    "pronom": "pron",
    "verbe": "verb",
    "adverbe": "adv",
}

# Files written by main(), in the order they are backed up.
_OUTPUT_FILES = ("dictOcc", "listOcc")


def _normalize_type(match):
    """Return the tuple to record for one word-type match, or None.

    ``match`` is a tuple produced by ``_TYPE_RE.findall``; its first item is
    the type name.  The result always has the type name at index 0.
    """
    name = match[0]
    if name in _KEPT_TYPES:
        return match
    if name in _TYPE_ALIASES:
        return (_TYPE_ALIASES[name],)
    if name.startswith("flex"):
        return ("flex",)
    if name.startswith("loc"):
        return ("loc",)
    return None


def extract_types(articles):
    """Extract the languages and word types found in a dump file.

    ``articles`` is the name of the file to read, line by line.

    Return a two-item list ``[ltype, llang]``:

    * ``ltype`` is a list of ``[type_tuple, language]`` entries, where
      ``type_tuple[0]`` is the normalized type name ("nom", "adj", "flex",
      "loc", "stub", ...) and ``language`` is the most recent language
      header seen before the entry ('' if none was seen yet);
    * ``llang`` is the list of every language header found, in file order.

    When the module-level ``_debug`` flag is set, the type names that were
    not recognized are printed at the end.
    """
    llang = []
    ltype = []
    current_lang = ''
    other_types = []

    with open(articles, 'r') as fin:
        for line in fin:
            for lang in _LANG_RE.findall(line):
                llang.append(lang)
                current_lang = lang

            for match in _TYPE_RE.findall(line):
                normalized = _normalize_type(match)
                if normalized is not None:
                    ltype.append([normalized, current_lang])
                elif _debug and match[0] not in other_types:
                    other_types.append(match[0])

            # At most one stub entry per line, whatever the number of markers.
            if _STUB_RE.search(line):
                ltype.append([("stub",), current_lang])

    if _debug:
        print("Types not computed :")
        print(other_types)
        print("\n")

    return [ltype, llang]


def compute_occ(extendedl):
    """Count the occurrences of each item of ``extendedl``.

    Return a dict mapping each item to its number of occurrences.
    """
    occ = {}
    for item in extendedl:
        occ[item] = occ.get(item, 0) + 1
    return occ


def compute_occ_subdict(extendedl, subdictTemplate=None):
    """Count word types per language.

    ``extendedl`` is a list of entries ``e`` where ``e[1]`` is the language
    and ``e[0][0]`` the type name (the ``ltype`` list of ``extract_types``).
    ``subdictTemplate`` is an optional dict copied to initialize the counters
    of every new language, so that all languages share the same keys.

    Return a dict of dicts: ``{language: {type_name: count}}``.
    """
    # None instead of a mutable {} default, which would be shared by calls.
    template = {} if subdictTemplate is None else subdictTemplate
    occ = {}
    for entry in extendedl:
        lang = entry[1]
        type_name = entry[0][0]
        counters = occ.get(lang)
        if counters is None:
            # Copy the template: each language needs its own counters.
            counters = occ[lang] = dict(template)
        counters[type_name] = counters.get(type_name, 0) + 1
    return occ


def file_to_list(nameFile):
    """Return the lines of ``nameFile`` as a list, newlines stripped."""
    with open(nameFile, 'r') as f:
        return [line.strip('\n') for line in f]


def list_to_file(l, nameFile):
    """Write a list to ``nameFile``, one item per line.

    Tuples are written as their first two items separated by a tab, strings
    are written as is; any other item only triggers a message on stdout.
    For lists not produced by this module, the result can be unexpected.
    """
    if not isinstance(l, list):
        print("Warning: The argument is not a list (list_to_file())")
        print("Unexpected behavior can occur")
    with open(nameFile, 'w') as f:
        for item in l:
            if isinstance(item, tuple):
                f.write(str(item[0]) + "\t" + str(item[1]) + '\n')
            elif isinstance(item, str):
                f.write(item + '\n')
            else:
                print("This format is not supported")


def dict_to_file(d, nameFile):
    """Write a dict to ``nameFile`` as "key value" lines.

    For dicts not produced by this module, the result can be unexpected.
    """
    if not isinstance(d, dict):
        print("Warning: The argument is not a dict (dict_to_file())")
        print("Unexpected behavior can occur")
    with open(nameFile, 'w') as f:
        for key in d:
            # %-formatting so that integer counts do not raise TypeError.
            f.write('%s %s\n' % (key, d[key]))


def make_diff(l1, l2):
    """Return the items of ``l2`` that are not in ``l1``, as a list.

    Duplicates are removed and the order of the result is unspecified.
    """
    return list(set(l2) - set(l1))


def retrieve_occ(l, occ):
    """Pair every item of ``l`` with its count taken from the dict ``occ``.

    Return a list of ``(occurrence, item)`` tuples; items missing from
    ``occ`` get an occurrence of 0.
    """
    return [(occ.get(item, 0), item) for item in l]


def usage():
    """Write the command-line help to stderr."""
    sys.stderr.write("""Options available are\n
    -h --help       Show this help
    -v --verbose    Enter verbose mode
    -i --input      Specify an input directory
    -o --output     Specify an output filename ("langsTableCol" by default) [OBSOLETE, it write now into "listOcc" and "dicOcc"]
    -d              (nothing changing)\n""")


def _handle_existing_outputs():
    """Ask what to do with output files left over from a previous run.

    Answering "y" overwrites them, "n" aborts with exit status 2, anything
    else copies each existing file to ``<name>.old`` before continuing.
    """
    existing = [name for name in _OUTPUT_FILES if os.path.isfile(name)]
    if not existing:
        return
    answer = _read_input(
        "Ouput file(s) already exist, overwrite ? [C/y/n] (by default copy "
        "the old file to *.old; y : to overwrite; n : to abort): ").lower()
    if answer == "y":
        return
    if answer == "n":
        print("operation aborted by user")
        sys.exit(2)
    for name in existing:
        shutil.copyfile(name, name + ".old")
    print("Old files copied to *.old")


def _write_pickle(obj, nameFile):
    """Pickle ``obj`` into ``nameFile``, always closing the file."""
    # Binary mode is required by pickle on Python 3 and harmless on Python 2.
    with open(nameFile, 'wb') as f:
        pickle.Pickler(f).dump(obj)


def main(argv):
    """Run the script with the command-line arguments ``argv``.

    Reads ``<input directory>/out`` and writes the ``dictOcc`` (word types
    per language) and ``listOcc`` (occurrences per language) pickle files in
    the current directory.  Exits with status 2 on an illegal argument or
    when the user refuses to overwrite existing output files.
    """
    global _verbose
    global _debug
    _verbose = 0
    _debug = 0
    inf = '/out'
    inputdir = "."

    try:
        opts, args = getopt.getopt(
            argv, "hvi:o:d", ["help", "verbose", "input=", "output="])
    except getopt.GetoptError:
        sys.stderr.write("Illegal argument\n")
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit(0)
        elif opt == '-d':
            _debug = 1
        elif opt in ("-v", "--verbose"):
            _verbose = 1
        elif opt in ("-o", "--output"):
            # Obsolete: still accepted so that old command lines keep
            # working, but the output names are fixed (see _OUTPUT_FILES).
            pass
        elif opt in ("-i", "--input"):
            # TODO: verify that arg is a directory path
            inputdir = arg

    # Old output files may have to be backed up before being replaced.
    _handle_existing_outputs()

    if _verbose:
        print('Extracting languages from "%s%s"...' % (inputdir, inf))
        print("It can now take several minutes...")
    llinks = extract_types(inputdir + inf)
    if _verbose:
        print("Languages extracted...")

    langsOcc = compute_occ(llinks[1])
    if _verbose:
        print("Languages computed...")

    # The template creates every field up front, to avoid errors later when
    # a language has no entry of a given type.
    typeOcc = compute_occ_subdict(
        llinks[0],
        {'nom': 0, 'nom-pr': 0, 'adj': 0, 'verb': 0, 'adv': 0, 'flex': 0,
         'loc': 0, 'stub': 0})
    if _verbose:
        print("Type of words computed...")
    if _debug:
        # .get(): a dump without one of these languages must not crash.
        print('fr :  %s' % (typeOcc.get('fr'),))
        print('nl :  %s' % (typeOcc.get('nl'),))
        print('ru :  %s' % (typeOcc.get('ru'),))

    _write_pickle(typeOcc, 'dictOcc')
    _write_pickle(langsOcc, 'listOcc')
    if _verbose:
        print("Pickle files written...")


if __name__ == '__main__':
    main(sys.argv[1:])

# To estimate the time
##import timeit
##t = timeit.Timer("main(sys.argv[1:])", "from __main__ import main")
##print t.repeat(3,5)