#! /usr/bin/env python
# -*- coding: utf-8 -*-
#Take a dump from a Wikimedia project and remove all the pages that are not in the main namespace.
#Write two files:
#'out' contains all the articles of the main namespace (titles whose text before a ":" found after the first character is not a known namespace prefix).
#'outTitle' lists the titles of those same articles, one title per line.
from __future__ import unicode_literals
import io
import sys, os, getopt, shutil, bz2, mimetypes
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
#TODO: Avoid parsing into html comments <!-- -->
class PreProcHandler(ContentHandler):
    """SAX handler that keeps only the main-namespace pages of a dump.

    For every <page> element whose title does not start with one of the
    known namespace prefixes, the handler appends the title and the content
    of the first <text> element of the page to the 'out' file, and the title
    alone to the 'outTitle' file. Both files are written as UTF-8 text.

    Args:
        output_dir: directory receiving 'out' and 'outTitle'. When None, the
            module-level ``outputdir`` is read when the document starts.
        prefixes: collection of namespace prefixes to discard. When None,
            the module-level ``prefixList`` is read for each page.
        debug: when true, report kept titles carrying an unknown prefix.
            When None, the module-level ``_debug`` is read for each page.
    """

    def __init__(self, output_dir=None, prefixes=None, debug=None):
        ContentHandler.__init__(self)
        self._output_dir = output_dir
        self._prefixes = prefixes
        self._debug = debug
        self.isTitle = False
        self.isText = False
        self.isFirstText = True
        # Last completed title/text of the current page.
        self.title = ""
        self.text = ""
        # Chunks received from characters(); joined once the element closes.
        self._titleParts = []
        self._textParts = []
        self.f = None
        self.fTitle = None

    def startDocument(self):
        """Open both output files in the output directory."""
        directory = outputdir if self._output_dir is None else self._output_dir
        self.f = io.open(os.path.join(directory, 'out'), 'w', encoding='utf-8')
        try:
            self.fTitle = io.open(os.path.join(directory, 'outTitle'), 'w',
                                  encoding='utf-8')
        except IOError:
            # Do not leak the first handle when the second file cannot be opened.
            self.f.close()
            raise

    def endDocument(self):
        """Close both output files, even if closing the first one fails."""
        try:
            self.f.close()
        finally:
            self.fTitle.close()

    def startElement(self, name, attrs):
        """Update the state flags when an opening tag is found."""
        if name == 'page':
            # Start from a clean state so that a page without <text> (or
            # without <title>) never reuses the data of the previous page.
            self.title = ""
            self.text = ""
            self._titleParts = []
            self._textParts = []
            self.isFirstText = True
        elif name == 'title':
            self.isTitle = True
            self.isFirstText = True
            self._titleParts = []
        elif name == 'text' and self.isFirstText:
            self.isText = True
            self._textParts = []

    def characters(self, ch):
        """Buffer the character data of the element being read."""
        if self.isTitle:
            self._titleParts.append(ch)
        if self.isText:
            self._textParts.append(ch)

    def endElement(self, name):
        """Update the state flags on a closing tag; write the page at </page>."""
        if name == 'title':
            self.isTitle = False
            self.title = "".join(self._titleParts)
        elif name == 'text':
            # Only the first <text> of a page is captured; later ones must
            # not overwrite it.
            if self.isText:
                self.text = "".join(self._textParts)
            self.isText = False
            self.isFirstText = False
        elif name == 'page':
            self._writePage()

    def _writePage(self):
        """Write the buffered page unless its title has a known prefix."""
        prefixes = prefixList if self._prefixes is None else self._prefixes
        debug = _debug if self._debug is None else self._debug
        # The search skips the first and the last character, so a leading or
        # a trailing colon is never treated as a namespace separator.
        indexBeforeColon = self.title.find(":", 1, -1)
        prefix = None
        if indexBeforeColon != -1:
            prefix = self.title[:indexBeforeColon]
        if prefix is not None and prefix in prefixes:
            return
        self.f.write('<title>' + self.title + '</title>\n' + self.text + '\n')
        self.fTitle.write(self.title + '\n')
        if debug and prefix is not None:
            # Show a prefix (like "talk" in "talk:dog") that is not in the list.
            sys.stdout.write("[DEBUG] %s  , like in  %s\n" % (prefix, self.title))
def usage():
    """Write the list of command-line options to standard error."""
    # The long option is spelled "--output", matching what getopt accepts.
    sys.stderr.write("""Options available are\n
-h --help Show this help
-i --input Give the input file (it's "xml_articles.xml" by default)
-o --output The output directory, where "out" and "outTitle" will be written
-v --verbose (nothing changing)
-d show debug info\n""")
def main(argv):
    """Parse the command line and filter the dump into 'out' and 'outTitle'.

    Args:
        argv: the command-line arguments, without the program name.

    The function sets the module-level names ``outputdir``, ``prefixList``,
    ``_verbose`` and ``_debug``, which PreProcHandler reads while parsing.
    It leaves through sys.exit(0) after printing the help, and through
    sys.exit(2) on an illegal option, a missing input file, an invalid
    output directory, an unsupported compression or a user abort.
    """
    global outputdir
    global _verbose
    global _debug
    global prefixList
    _verbose = 0
    _debug = 0
    prefixList = ["Wiktionnaire", "MediaWiki", "Annexe", "Modèle", "Fichier", "Aide", "Thésaurus", "Catégorie", "Projet", "Portail", "Transwiki", "WT"]
    source = "xml_articles.xml"
    outputdir = "."

    try:
        opts, _ = getopt.getopt(argv, "hi:o:vd", ["help", "input=", "output=", "verbose"])
    except getopt.GetoptError:
        sys.stderr.write("Illegal argument\n")
        usage()
        sys.exit(2)

    #TODO: add a condition like "-c" (copy) to have an inputless script
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit(0)
        elif opt == '-d':
            _debug = 1
        elif opt in ("-v", "--verbose"):
            _verbose = 1
            #TODO: put verbose condition
        elif opt in ("-i", "--input"):
            source = arg
        elif opt in ("-o", "--output"):
            if not os.path.isdir(arg):
                sys.stderr.write(" '%s' is not a directory\n" % arg)
                usage()
                sys.exit(2)
            outputdir = arg

    if not os.path.isfile(source):
        sys.stderr.write("The input file '%s' was not found\n" % source)
        usage()
        sys.exit(2)

    inputType = mimetypes.guess_type(source)
    # Reject an unsupported compression before asking anything to the user.
    if inputType[1] not in (None, "bzip2"):
        sys.stderr.write("The input file type '%s' was not recognized\n" % inputType[1])
        usage()
        sys.exit(2)
    # Depending on the platform's MIME tables, ".xml" maps to either value.
    if inputType[0] not in ("application/xml", "text/xml"):
        sys.stdout.write("Input file is of type : %s; This is strange\n" % str(inputType))

    # Look for files left by a previous run. If there are some, ask whether
    # they must be copied to *.old (default), overwritten (y), or whether
    # the program must stop (n).
    outPath = os.path.join(outputdir, "out")
    outTitlePath = os.path.join(outputdir, "outTitle")
    if os.path.isfile(outPath) or os.path.isfile(outTitlePath):
        try:
            ask = raw_input  # Python 2
        except NameError:
            ask = input  # Python 3
        doicontinue = ask("Ouput file(s) already exist, overwrite ? [C/y/n] (by default copy the old file to *.old; y : to overwrite; n : to abort): ")
        answer = doicontinue.strip().lower()
        if answer == "n":
            sys.stdout.write("operation aborted by user\n")
            sys.exit(2)
        elif answer != "y":
            for existing in (outPath, outTitlePath):
                if os.path.isfile(existing):
                    shutil.copyfile(existing, existing + ".old")
            sys.stdout.write("Old files copied to *.old\n")

    sys.stdout.write("It can now take several minutes...\n")
    parser = make_parser()
    curHandler = PreProcHandler()
    parser.setContentHandler(curHandler)
    if inputType[1] == "bzip2":
        compressed = bz2.BZ2File(source, 'r')
        try:
            parser.parse(compressed)
        finally:
            # Release the file handle even when parsing fails.
            compressed.close()
    else:
        parser.parse(source)
# Run the command-line interface only when this file is executed as a
# script; the program name (first element of sys.argv) is not passed on.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)