Utilisateur:FtiercelBot/tranInter.py

Définition, traduction, prononciation, anagramme et synonyme sur le dictionnaire libre Wiktionnaire.

tranInter.py

#!/usr/bin/python
# -*- coding: utf-8  -*-

"""
This bot goes over multiple pages of the home wiki and interwikifies the
translation lines found in their {{-trad-}} sections, switching between the
{{trad}} and {{trad-}} templates depending on whether the target wiktionary
has the entry.

Don't forget to set the ftout to your current list of words,
see below for a line that looks like:
ftout =open('/home/cmillet/wikitruks/wiktio/all/2005-12-14.txt', 'r')

This script understands various command-line arguments:

    -start:        used as -start:page_name, specifies that the robot should
                   go alphabetically through all pages on the home wiki,
                   starting at the named page.

    -file:         used as -file:file_name, read a list of pages to treat
                   from the named textfile. Page titles should be enclosed
                   in [[double-squared brackets]].

    -ref:          used as -ref:page_name, specifies that the robot should
                   touch all pages referring to the named page.

    -cat:          used as -cat:category_name, specifies that the robot should
                   touch all pages in the named category.

All other parameters will be regarded as a page title; in this case, the bot
will only touch a single page.
"""
import wikipedia, wiktionary, pagegenerators, catlib
import sys
import re

# Heading template that opens a translations section on fr.wiktionary.
tradMsg = "{{-trad-}}"
# An HTML comment, possibly spanning several lines; stripped before analysis
# so commented-out translations are ignored.  (Raw/clean patterns replace the
# original over-escaped non-raw strings; the regex semantics are identical.)
commentCompiler = re.compile(u"<!--(.*?)-->", re.DOTALL | re.MULTILINE)
# One translation line: optional ":", one "*", spaces, a {{language code}},
# then the rest of the line up to and including the newline.
# group(1) = whole line, group(2) = language code, group(3) = links part.
translntLineCompiler = re.compile(r"(^:?\* *\{\{(\w*?)\}\}(.*?\n))", re.MULTILINE)
# A plain wiki link: [[target]].
oldLinkCompiler = re.compile(r"\[\[(.*?)\]\]")
'''
listelng = ['als','an','am','ar','as','ast','ay','az','bm','de','dog','el','en','eo','es','et','fr','gu','hi','io','ja','it','ko','ln','nl','oc','vi','pt','ru','sv','tr','ia','kn','pl','ta','la','ur','fi','bn','ga','grc','hu','pa','fa','sl','bo','uk','af','sq','hy','br','bg','ca','chr','da','gl','ka','he','ku','lv','li','mn','ro','sa','sr','scn','tt','cs','th','te','yi','or','ml','zh','zu']
# retrait de dog : pas de wiktionnaire dans cette langue
nowiktiolng = ['dog','fil','grc']

'''

# Wiktionaries that make the distinction between this and This:
# (the list http://meta.wikimedia.org/wiki/Help:Page_name is not really up to date)
nocaplng = ['af','bg','cs','de','en','eo','es','fa','fi','fo','fr','gu','hi','hr','hu','is','it','ja','ka','kn','ku','ml','nl','sa','scn','sq','sv','sw','tr','vi']

# Wiktionaries I checked that still capitalize their entries:
# ln -- pt

# ftout MUST BE SET correctly

# Maps a language code to the known entries of that language's wiktionary
# (filled below from the dump file opened as ftout).
wordList = {}

# Load the word dump: one "language:word" line per entry.  Membership tests
# against these collections happen once per translation on every page, so the
# words are stored in sets (O(1) lookup) instead of lists (O(n) scan per test).
# The file is closed even if a malformed line (no ":") raises.
ftout = open('./2007-7-25.txt', 'r')
try:
  for line in ftout:
    language, translation = line.split(":", 1)
    if language not in wordList:
      wordList[language] = set()
    # translation keeps its trailing newline; lookups below append one too.
    wordList[language].add(translation)
finally:
  ftout.close()

wikipedia.setAction(u'interwikification des traductions (modèle trad)')

                            # if it is {{-... or [[... then we entered another section, or the list ended
                            # if re.compile("^(\{\{-|\[\[)",re.M).match(newtext,curIdx):

class TranslationBot:
  """Interwikifies translation lines on fr.wiktionary pages.

  For every page produced by the generator, scans the text after each
  {{-trad-}} marker line by line and checks every translation against
  wordList (the dump of entries known to exist on each foreign wiktionary):

    {{trad|lang|word}}  -> {{trad-|lang|word}}  when the entry is missing
    {{trad-|lang|word}} -> {{trad|lang|word}}   when the entry exists
    [[word]]            -> {{trad|...}} or {{trad-|...}} accordingly

  Changes are shown as a diff and uploaded after confirmation (or
  unconditionally when acceptall is True).
  """
  def __init__(self, generator, acceptall = False):
    # generator: yields wikipedia.Page objects to process.
    # acceptall: when True, upload every change without prompting.
    self.generator = generator
    self.acceptall = acceptall
    
  def run(self):
    for page in self.generator:
      try:
        wikipedia.output('page: %s' % page.title())
        thePage = page.get()
        theChangedPage = thePage # as newtext, but without comment
        # removing <!-- --> so commented-out translations are not analyzed;
        # replacements are applied to both oldText-derived newText and to
        # theChangedPage (which still contains the comments).
        oldText = commentCompiler.sub(u"", thePage)
        # We need to do something here
        newText = oldText
        curIdx = newText.find(tradMsg, 0)
        while curIdx != -1:
          curIdx += len(tradMsg)
          # (eventually one ":") one *, one whitespace, one {{language code}}, then links, then newline char
          # NOTE(review): the search runs on oldText but curIdx is also
          # advanced via newText.find() below; once replacements change the
          # text length the two indexes can drift -- confirm on pages with
          # several {{-trad-}} sections.
          result = translntLineCompiler.search(oldText,curIdx)
          while result:
            completeLine = result.group(1)   # the full "* {{xx}} ..." line
            lang = result.group(2)           # language code of this line
            analyzedPart = result.group(3)   # everything after {{xx}}
            newLine = completeLine

            # Pass 1: existing {{trad|lang|word}} whose target entry is not
            # in the dump -> downgrade to {{trad-|lang|word}}.
            pattern = u'\{\{trad\|%s\|(.*?)\}\}'%lang
            transList = re.findall(pattern, analyzedPart)
            for translt in transList :
              # we are unable to process the cases in which there is #
              if '#' in translt:
                continue
              wikipedia.output(u'recherche de "%s:%s"'%(lang,translt) )
              # dump lines keep their trailing newline, so append one here
              tosearch = u'%s\n'%translt
              tosearch = tosearch.encode('utf-8')
              if lang not in wordList or tosearch not in wordList[lang]:
                print "DEWIKIFICATION"
                new = u'{{trad-|%s|%s}}'%(lang,translt)
                old = u'{{trad|%s|%s}}'%(lang,translt)
                newLine = newLine.replace(old , new)
              
            # Pass 2: existing {{trad-|lang|word}} whose target entry now
            # exists -> upgrade back to {{trad|lang|word}}.
            pattern = u'\{\{trad-\|%s\|(.*?)\}\}'%lang
            transList = re.findall(pattern , analyzedPart )
            for translt in transList :
              if '#' in translt:
                continue
              wikipedia.output(u'recherche de "%s:%s"'%(lang,translt) )
              tosearch = u'%s\n'%translt
              tosearch = tosearch.encode('utf-8')
              if lang in wordList and tosearch in wordList[lang]:
                print "INTERWIKIFICATION"
                old = u'{{trad-|%s|%s}}'%(lang,translt)
                new = u'{{trad|%s|%s}}'%(lang,translt)
                newLine = newLine.replace(old , new)
              
            # Pass 3: plain [[word]] links -> {{trad}} when the entry exists,
            # {{trad-}} otherwise (also when lang has no dump at all).
            transList = oldLinkCompiler.findall(analyzedPart)
            for translt in transList :
              if '#' in translt:
                continue
              wikipedia.output(u'recherche de "%s:%s"'%(lang,translt) )
              tosearch = u'%s\n'%translt
              tosearch = tosearch.encode('utf-8')
              if lang in wordList and tosearch in wordList[lang]:
                print "INTERWIKIFICATION"
                old = u'[[%s]]'%translt
                new = u'{{trad|%s|%s}}'%(lang,translt)
                newLine = newLine.replace(old , new)
              else:
                print "REDEWIKIFICATION"
                old = u'[[%s]]'%translt
                new = u'{{trad-|%s|%s}}'%(lang,translt)
                newLine = newLine.replace(old , new)
              
              # end of line analyze
            # Apply the rewritten line to both working copies.
            newText = newText.replace(completeLine, newLine)
            theChangedPage = theChangedPage.replace(completeLine, newLine)
            curIdx = result.end(3)
            result = translntLineCompiler.search(oldText,curIdx)
            # end of inner while over translation lines
          curIdx = newText.find(tradMsg, curIdx)
          # end of while we are in the translation section
        # end of while {{-trad-}}
        
        # we upload the text (theChangedPage: same edits, comments kept)
        if newText == oldText:
          wikipedia.output('No changes were necessary in %s' % page.title())
        else:
          wikipedia.output(u'>>> %s <<<' % page.title())
          wikipedia.showDiff(thePage, theChangedPage)
          if not self.acceptall:
              choice = wikipedia.inputChoice(u'Do you want to accept these changes?',  ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
              if choice in ['a', 'A']:
                  self.acceptall = True
          # safe even when choice is unset: acceptall short-circuits the "or"
          if self.acceptall or choice in ['y', 'Y']:
              print "put"
              page.put(theChangedPage)
      
      except wikipedia.NoPage:
          print "Page %s does not exist?!?!"%page.aslink()
      except wikipedia.IsRedirectPage:
          # redirects carry no translations of their own
          pass
      except wikipedia.LockedPage:
          # cannot edit protected pages; skip silently
          pass

def main():
    """Build a page generator from the command-line arguments and run the bot.

    Recognized generator arguments: -start:, -ref:, -links:, -file:, -cat:.
    Any leftover arguments are joined into a single page title; with no
    arguments at all the 'touch' help text is shown.
    """
    generator = None
    title_words = []
    for raw_arg in sys.argv[1:]:
        parsed = wikipedia.argHandler(raw_arg, 'touch')
        if not parsed:
            continue
        if parsed.startswith('-start:'):
            generator = pagegenerators.AllpagesPageGenerator(parsed[7:])
        elif parsed.startswith('-ref:'):
            target = wikipedia.Page(wikipedia.getSite(), parsed[5:])
            generator = pagegenerators.ReferringPageGenerator(target)
        elif parsed.startswith('-links:'):
            source = wikipedia.Page(wikipedia.getSite(), parsed[7:])
            generator = pagegenerators.LinkedPageGenerator(source)
        elif parsed.startswith('-file:'):
            generator = pagegenerators.TextfilePageGenerator(parsed[6:])
        elif parsed.startswith('-cat:'):
            category = catlib.Category(wikipedia.getSite(), parsed[5:])
            generator = pagegenerators.CategorizedPageGenerator(category)
        else:
            title_words.append(parsed)

    # Leftover words form one page title and override any generator above.
    if title_words:
        single_page = wikipedia.Page(wikipedia.getSite(), ' '.join(title_words))
        generator = iter([single_page])

    if generator:
        bot = TranslationBot(pagegenerators.PreloadingGenerator(generator))
        bot.run()
    else:
        wikipedia.showHelp('touch')

if __name__ == "__main__":
    try:
        main()
    finally:
        # Release the framework's site/throttle resources even when main() fails.
        wikipedia.stopme()