Utilisateur:PamputtBot/parse dump.cpp

Ce code C++ lit ligne par ligne le dump des articles du Wiktionnaire (frwiktionary-latest-pages-articles.xml), que l’on a précédemment récupéré sur https://dumps.wikimedia.org/frwiktionary/latest/ et décompressé dans le dossier de parse_dump.cpp.
On compile le programme avec la commande suivante
g++ parse_dump.cpp -std=c++17 -o parse_dump
puis on l’exécute en tapant « ./parse_dump frwiktionary-latest-pages-articles.xml ».
Ce programme crée autant de fichiers de sortie au format texte qu’il y a d’analyses. Le contenu de chaque fichier peut ensuite être copié dans les différentes sous-pages de Wiktionnaire:Maintenance et nettoyage.
/*
  g++ parse_dump.cpp -std=c++17 -o parse_dump
  ./parse_dump /chemin/vers/frwiktionary-latest-pages-articles.xml
 */

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <map>
#include <string>
#include <algorithm> 
#include <cctype>
#include <locale>
 
using namespace std;

/**
 * Extracts the language code from a language heading such as
 * "== {{langue|fr}} ==".
 *
 * @param line One line of wikitext.
 * @return The text between "{{langue|" and the next "}}" (e.g. "fr").
 *         Returns an empty string when the line contains no "{{langue|"
 *         marker. When the closing "}}" is missing, everything after the
 *         marker up to the end of the line is returned.
 */
std::string getCodeLangue(const std::string &line) {
  static const std::string marker = "{{langue|";

  const std::size_t marker_pos = line.find(marker);
  if (marker_pos == std::string::npos)
    return std::string();  // no marker: do not compute offsets from npos

  const std::size_t code_begin = marker_pos + marker.size();
  const std::size_t code_end = line.find("}}", code_begin);
  if (code_end == std::string::npos)
    return line.substr(code_begin);  // unterminated template
  return line.substr(code_begin, code_end - code_begin);
}

// Removes leading whitespace (as classified by std::isspace) from `s`, in place.
static inline void ltrim(std::string &s) {
    std::size_t first_kept = 0;
    // The cast keeps std::isspace well-defined for bytes >= 0x80 (UTF-8 text).
    while (first_kept < s.size() &&
           std::isspace(static_cast<unsigned char>(s[first_kept]))) {
        ++first_kept;
    }
    s.erase(0, first_kept);
}

// Removes trailing whitespace (as classified by std::isspace) from `s`, in place.
static inline void rtrim(std::string &s) {
    std::size_t kept_length = s.size();
    // The cast keeps std::isspace well-defined for bytes >= 0x80 (UTF-8 text).
    while (kept_length > 0 &&
           std::isspace(static_cast<unsigned char>(s[kept_length - 1]))) {
        --kept_length;
    }
    s.erase(kept_length);
}

// Removes whitespace from both ends of `s`, in place.
// The two passes are independent, so their order does not affect the result.
static inline void trim(std::string &s) {
    ltrim(s);
    rtrim(s);
}

// Returns a copy of `s` without its leading whitespace.
// `s` is taken by value, so the caller's string is left untouched.
static inline std::string ltrim_copy(std::string s) {
    ltrim(s);
    return s;
}

// Returns a copy of `s` without its trailing whitespace.
// `s` is taken by value, so the caller's string is left untouched.
static inline std::string rtrim_copy(std::string s) {
    rtrim(s);
    return s;
}

// trim from both ends (copying)
static inline std::string trim_copy(std::string s) {
    trim(s);
    return s;
}

 
int main(int argc, char **argv) {

  if(argc == 1 || argc>2) {
    cout << argv[0] << " /chemin/vers/frwiktionary-latest-pages-articles.xml" << endl;
    return 0;
  }
  string nomFichier = argv[1];
  
  //https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-articles.xml.bz2
  ifstream infile(nomFichier.c_str(),ifstream::in);
  if(!infile) {
    cout << "Le fichier " << nomFichier << " n'existe pas" << endl;
    return 0;
  }

  string nomFichierSortie = "mot_feminin_sans_exemple.txt";
  ofstream outMotsFemininsSansExemple(nomFichierSortie.c_str(), ofstream::out);
  if(!outMotsFemininsSansExemple) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  nomFichierSortie = "definition_masculin_ou_feminin.txt";
  ofstream outDefinitionMasculinOuFeminin(nomFichierSortie.c_str(), ofstream::out);
  if(!outDefinitionMasculinOuFeminin) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  nomFichierSortie = "genre_et_equi-pour_pas_coherent.txt";
  ofstream outGenreEquivpourPasCoherent(nomFichierSortie.c_str(), ofstream::out);
  if(!outGenreEquivpourPasCoherent) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  nomFichierSortie = "trad_hors_section_en_francais.txt";
  ofstream outTradHorsSectionFr(nomFichierSortie.c_str(), ofstream::out);
  if(!outTradHorsSectionFr) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  nomFichierSortie = "noms_sans_genre.txt";
  ofstream outNomsSansGenre(nomFichierSortie.c_str(), ofstream::out);
  if(!outNomsSansGenre) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  nomFichierSortie = "entrees_sans_aucune_definition.txt";
  ofstream outSansDefinition(nomFichierSortie.c_str(), ofstream::out);
  if(!outSansDefinition) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  nomFichierSortie = "debut_incorrect.txt";
  ofstream outDebutIncorrect(nomFichierSortie.c_str(), ofstream::out);
  if(!outDebutIncorrect) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  nomFichierSortie = "code_langue_incoherent_etyl.txt";
  ofstream outEtylIncoherent(nomFichierSortie.c_str(), ofstream::out);
  if(!outEtylIncoherent) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  nomFichierSortie = "dicoado_mauvaise_section.txt";
  ofstream outDicoado(nomFichierSortie.c_str(), ofstream::out);
  if(!outDicoado) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  nomFichierSortie = "section_vide.txt";
  ofstream outSectionVide(nomFichierSortie.c_str(), ofstream::out);
  if(!outSectionVide) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
 
  std::string titre;
  std::vector<std::string> codesLangue;
  std::unordered_set<std::string> tousLesMots;
  std::vector<std::pair<std::string, std::string>> valeursEquivPour;
  std::unordered_map<std::string, std::pair<std::string, std::string>> etylLanguesMap;

  std::string ns;
 
  string line;
  size_t pos1, pos2;
  unsigned int cmpt = 0;

  while(getline(infile,line)) {
    if(line.find("<page>")!=string::npos) {
      titre = "";
      codesLangue.clear();
      ns = "";
      bool is_redirect = false;
      bool balise_text = false;
      bool is_debut_incorrect = false;
      bool is_first_line = false;
      bool is_nom = false;
      bool f_and_equivpour = false;
      bool m_and_equivpour = false;
      bool equiv_pour_et_genre_pas_coherent = false;
      bool has_example = false;
      bool is_fr = false;
      bool is_habitante = false;
      bool is_masculin_or_feminin = false;
      bool trad_pas_fr = false;
      bool pas_de_genre = false;
      std::unordered_map <string, bool> juste_ebauche_def;
      bool is_fr_reg_x = false;
      bool is_etyl_incoherent = false;
      bool dicoado_mauvaise_section = false;
      std::vector <std::string> sections_vides;
      std::vector <std::string> sections_vides_langue;
      std::string nom_section;
      std::string contenu_section;
      std::string langue;
      
      while(getline(infile,line)) {
	if(line.find("<title>")!=string::npos) {
	  // on extrait le titre des balises <title>
	  pos1  = line.find("<title>");
	  pos2  = line.find("</title>");
	  titre = line.substr(pos1+7,pos2-pos1-7);
	  tousLesMots.insert(titre);
	} 

	if(line.find("<ns>")!=string::npos) {
	  //on vérifie que l'espace de nom est bien « 0 »
	  pos1 = line.find("<ns>");
	  pos2 = line.find("</ns>");
	  ns   = line.substr(pos1+4,pos2-pos1-4);
	}

	if(line.find("<text")!=string::npos) {
	   balise_text = true;
	   is_first_line = true;
	}

	if((line.find("#REDIRECT")!=string::npos ||
	   line.find("#redirect")!=string::npos ||
	   line.find("#Redirect")!=string::npos) &&
	   balise_text)
	  is_redirect = true;

	if(is_first_line) {
	  // <text bytes="2157" xml:space="preserve">WIKICODE
	  size_t pos = line.find(">");
	  if(line.find("=={") > pos+3 && // pour =={{langue|
	     line.find("== ") > pos+3 && // pour == {{langue| 
	     line.find("{{v") > pos+3 && // pour {{voir|x|y|z}})
	     line.find("{{t") > pos+3 && // pour {{titre incorrect|xxxxx}} ou {{tire mis en forme|xxxxx}})
	     line.find("{{s") > pos+3 && // pour {{supprimer ?}} ou {{PàS}}
	     line.find("{{P") > pos+3 && // pour {{PàS}}
	     line.find("{{à") > pos+3 && // pour {{à supprimer}}
	     line.find("{{f") > pos+3 && // pour {{formater}}
	     line.find("{{é") > pos+3 && // pour {{ébauche}}
	     line.find("{{a") > pos+3 && // pour {{alphabet
	     line.back() != '>' // pour une ligne vide
	     ) {
	    is_debut_incorrect = true;
	  }
	}
	
	if(line.find("{{langue|")!=string::npos &&
	   line.find("==")!=string::npos &&
	   balise_text) {
	  langue=getCodeLangue(line);
	  if (langue == "fr")
	    is_fr = true;
	  else
	    is_fr = false;

	  juste_ebauche_def[langue] = true;
	}

	
	// le dico des ados
	if(!is_fr &&
	   (line.find("{{Le Dico des Ados") != string::npos ||
	    line.find("{{Dicoado") != string::npos)) 
	  dicoado_mauvaise_section = true;

	// is_nom
	if(line.find("===")!=string::npos &&
	   balise_text)
	  if(line.find("{{S|nom|")!=string::npos)
	    is_nom = true;
	  else
	    is_nom = false;

	
	// section vide ?
	if(line.find("===")!=string::npos &&
	   balise_text) {
	  if(line.find("{{S|")!=string::npos) {
	    bool is_pron=false;
	    bool is_ref=false;
	    bool is_trad=false;
	    std::string section_prec = nom_section;
	    // on vérifie d'abord le contenu de la section précédente
	    if(trim_copy(contenu_section).empty()) {
	      if(nom_section == "prononciation")
		is_pron = true;
	      if(nom_section == "références")
		is_ref=true;
	      if(nom_section == "traductions")
		is_trad=true;

	      if(!is_pron &&
		 !is_ref &&
		 !is_trad) {
		sections_vides.push_back(nom_section);
		sections_vides_langue.push_back(langue);
	      }
	    }
	    
	    // on réinitialise contenu_section
	    contenu_section = "";

	    // puis, on extrait le nom de la nouvelle section
	    pos1 = line.find("{{S|") + 4;
	    pos2 = line.find("|", pos1+1);
	    if(pos2 == string::npos)
	      pos2 = line.find("}}", pos1+1);
	    nom_section = line.substr(pos1, pos2-pos1);


	    if(is_pron &&
	       nom_section != "paronymes" &&
	       nom_section != "homophones") {
		sections_vides.push_back(section_prec);
		sections_vides_langue.push_back(langue);
	      }
	    
	    if(is_ref &&
	       nom_section != "sources" &&
	       nom_section != "bibliographie") {
		sections_vides.push_back(section_prec);
		sections_vides_langue.push_back(langue);
	      }
	    
	    if(is_trad &&
	       nom_section != "traductions à trier") {
		sections_vides.push_back(section_prec);
		sections_vides_langue.push_back(langue);
	      }
	      
	  }
	} else // ce n'est pas une section de titre
	  contenu_section += line;
	
	// trad_pas_fr
	if(line.find("====")!=string::npos &&
	   balise_text &&
	   !is_fr)
	  if(line.find("{{S|trad")!=string::npos)
	    trad_pas_fr = true;

	// nom sans genre sur la ligne de forme
	const string bold_title = "'''" + titre + "'''";
	if(line.find(bold_title) != string::npos && // titre en gras
	   line.find("{{pron|") != string::npos && // le modèle « pron » est normalement présent sur la ligne de forme
	   line.find("#") == string::npos && // pas un exemple
	   line.find(":") == string::npos && // pas une note
	   line.find("*") == string::npos && // pas une note
	   line.find(";") == string::npos && // pas une note
	   line.find("{{note}}") == string::npos && // pas une note
	   // line.find("|") == string::npos && // pas un exemple sur plusieurs ligne
	   line.find("/") == string::npos && // pas un exemple sur plusieurs ligne
	   line.find("&gt;") == string::npos && // pas un exemple sur plusieurs ligne
	   line.find("[[") == string::npos && // pas une image
	   line.find("]]") == string::npos && // pas une image
	   is_nom &&
	   is_fr) {
	  if(line.find("{{f}}") == string::npos &&
	     line.find("{{m}}") == string::npos &&
	     line.find("{{mf") == string::npos &&
	     line.find("{{fm") == string::npos &&
	     line.find("{{genre") == string::npos)
	    // cout << line << endl;
	    pas_de_genre = true;
	}

	
	size_t pos_equiv_pour = line.find("{{équiv-pour");
	if(pos_equiv_pour != string::npos) {
	  // féminin
	  size_t pos_f = line.find("{{f}}");
	  if(pos_f != string::npos) {
	    f_and_equivpour = true;

	    if (line.find("|une ") != string::npos &&
		line.find("une personne") == string::npos &&
		line.find("une entité") == string::npos &&
		pos_f < pos_equiv_pour) {
	      // std::cout << titre << " -> " << line << std::endl;
	      equiv_pour_et_genre_pas_coherent = true;
	    }
	  }

	  // masculin
	  size_t pos_m = line.find("{{m}}");
	  if(pos_m != string::npos) {
	    m_and_equivpour = true;

	    if (line.find("|un ") != string::npos &&
		line.find("une personne") == string::npos &&
		line.find("une entité") == string::npos &&
		pos_m < pos_equiv_pour)
	      equiv_pour_et_genre_pas_coherent = true;
	  }

	  if(is_fr) {
	    // on extrait le(s) mot(s) contenu(s) dans le modèle « équiv-pour »
	    size_t fin_equiv_pour = line.find("}}", pos_equiv_pour);
	    stringstream ss(line.substr(pos_equiv_pour, fin_equiv_pour-pos_equiv_pour));
	    vector<string> tokens;
	    string mot;
	    unsigned cmpt=0;
	    while(getline(ss, mot, '|')) {
	      if(mot.find("=") != string::npos) continue;
	      if(cmpt++<2) continue;

	      if(mot.empty()) {
		//cout << titre << " -> " << line << endl;
		continue;
	      }
	      
	      valeursEquivPour.push_back(std::make_pair(mot, titre));
	    }
	  }
	}

	if((line.find("{{exemple") != string::npos ||
	    line.find("#*") != string::npos) &&
	   line.find("{{ébauche-exe") == string::npos &&
	   is_fr)
	  has_example = true;

	if(line.find("#") != string::npos &&
	   (line.find("abitante") != string::npos ||
	    line.find("abitant]]e") != string::npos) &&
	   is_fr)
	  is_habitante = true;

	if(line.find("#") != string::npos &&
	   is_nom &&
	   (line.find("''Masculin") != string::npos ||
	    line.find("''Fémini") != string::npos) &&
	   is_fr)
	  is_masculin_or_feminin = true;

	// Codes langue incohérents dans étyl
	size_t pos0 = 0;
	while(line.find("{{étyl|", pos0+1) != string::npos) {
	  // {{étyl|la|fr|-atio|sens=[[-ation]]}}.
	  pos0 = line.find("{{étyl|", pos0+1); // {{étyl|
	  size_t pos1 = line.find("|", pos0+8); // la|
	  size_t pos2a = line.find("|", pos1+1); // fr|
	  size_t pos2b = line.find("}}", pos1+1); // fr|
	  size_t pos2 = std::min(pos2a, pos2b);

	  std::string lang1 = line.substr(pos0+8, pos1-pos0-8);
	  std::string lang2 = line.substr(pos1+1, pos2-pos1-1);
	  // std::cout << line << ", lang1: " << lang1 << ", lang2: " << lang2 << std::endl;

	  if(lang2 != langue
	     && balise_text
	     && ns=="0"
	     && !is_redirect 
	     && titre.find("Titres non pris en charge") == string::npos) {
	    if(titre == "avicultritz")
	      std::cout << titre << " <-> " << line << "(pos1,pos2)=(" << pos1 << "," << pos2 << ")" << std::endl;
	    etylLanguesMap[titre] = std::make_pair(langue, lang2);
	    // is_etyl_incoherent = true;
	  }
	}


	// ne contient que des « ébauche-déf »
	if(line.find("#") != string::npos &&
	   line.find("#*") == string::npos) {
	  if (line.find("{{ébauche-déf") != string::npos)
	    juste_ebauche_def[langue] &= true;
	  else
	    juste_ebauche_def[langue] &= false;	  
	}

	
	is_first_line = false;
	
	if(line.find("</page>")!=string::npos)
	  break;
      }


      ////////////////////////////////////////////////////////
      //
      // On remplit les fichiers de résultats
      // 
      ////////////////////////////////////////////////////////
      
      // début de page incorrect
      if(ns=="0" &&
	 !is_redirect &&
	 is_debut_incorrect)
	outDebutIncorrect << "# [[" << titre << "]]" << endl;

	 
      // noms féminins sans exemple
      if(ns=="0" &&
	 !is_redirect &&
	 titre.find("Titres non pris en charge") == string::npos
	 && f_and_equivpour
	 && !has_example
	 && !is_habitante
	 && is_fr) {

	outMotsFemininsSansExemple << "# [[" << titre << "]]" << endl;
	// cmpt++;
	// if(cmpt%1000 == 0)
	//   cout << cmpt << "\t" << titre << endl;
      }

      // définition commençant par Masculin ou Féminin
      if(ns=="0"
	 && !is_redirect 
	 && titre.find("Titres non pris en charge") == string::npos
	 && is_masculin_or_feminin) {

	outDefinitionMasculinOuFeminin << "# [[" << titre << "]]" << endl;
	// cmpt++;
	// if(cmpt%1000 == 0)
	//   cout << cmpt << "\t" << titre << endl;
      }

      // genre et le contenu d'équiv-pour pas cohérents
      if(ns=="0"
	 && !is_redirect 
	 && titre.find("Titres non pris en charge") == string::npos
	 && equiv_pour_et_genre_pas_coherent) {

	outGenreEquivpourPasCoherent << "# [[" << titre << "]]" << endl;
	// cmpt++;
	// if(cmpt%1000 == 0)
	//   cout << cmpt << "\t" << titre << endl;
      }

      // genre et le contenu d'équiv-pour pas cohérents
      if(ns=="0"
	 && !is_redirect 
	 && titre.find("Titres non pris en charge") == string::npos
	 && trad_pas_fr) {

	outTradHorsSectionFr << "# [[" << titre << "]]" << endl;
	// cmpt++;
	// if(cmpt%1000 == 0)
	//   cout << cmpt << "\t" << titre << endl;
      }

      // genre et le contenu d'équiv-pour pas cohérents
      if(ns=="0"
	 && !is_redirect 
	 && titre.find("Titres non pris en charge") == string::npos
	 && pas_de_genre) {

	outNomsSansGenre << "# [[" << titre << "]]" << endl;
	// cmpt++;
	// if(cmpt%1000 == 0)
	//   cout << cmpt << "\t" << titre << endl;
      }
      
      // ne contient que des « ébauche-déf »
      if(ns=="0"
	 && !is_redirect 
	 && titre.find("Titres non pris en charge") == string::npos) {

	for(const auto& [langue, sans_def] : juste_ebauche_def) {
	  if(langue == "ar"
	     || langue == "ja"
	     || langue == "ko"
	     || langue == "ko-Hani"
	     || langue == "vi"
	     || langue == "vi-chunom"
	     || langue == "zh")
	    continue;

	  if(sans_def) {
	    outSansDefinition << "# [[" << titre << "]] (" << langue << ")" << endl;
	
	
	    // cmpt++;
	    // if(cmpt%1000 == 0)
	    //   cout << cmpt << "\t" << titre << " (" << langue << ")" << endl;
	  }
	  
	}
      } // end ebauche_def

      // dicoado
      if(ns=="0"
	 && !is_redirect 
	 && titre.find("Titres non pris en charge") == string::npos
	 && dicoado_mauvaise_section)
	outDicoado << "# [[" << titre << "]]" << endl;


      // sections vides
      if(ns=="0"
	 && !is_redirect 
	 && titre.find("Titres non pris en charge") == string::npos) {
	int cmpt=0;
	for(int i=0 ; i<sections_vides.size() ; ++i) {
	  if(cmpt==0)
	    outSectionVide << "# [[" << titre << "]] (" << sections_vides_langue.at(i) << "), " << sections_vides.at(i);
	  else
	    outSectionVide << ", " << sections_vides.at(i);

	  cmpt++;
	}
	if(!sections_vides.empty())
	  outSectionVide << std::endl;
      }
      
    }
    // if(cmpt==10) break;
  }
  infile.close();
  outMotsFemininsSansExemple.close();
  outDefinitionMasculinOuFeminin.close();
  outGenreEquivpourPasCoherent.close();
  outTradHorsSectionFr.close();
  outNomsSansGenre.close();
  outSansDefinition.close();
  outDebutIncorrect.close();
  outDicoado.close();
  outSectionVide.close();

  // on vérifie si les mots contenus dans les modèles « équiv-pour »
  // existent déjà comme entrée
  nomFichierSortie = "equi_pour_existe_pas_encore.txt";
  ofstream outEquivpourExistePasEncore(nomFichierSortie.c_str(), ofstream::out);
  if(!outEquivpourExistePasEncore) {
    cout << "Probleme avec le fichier de sortie " << nomFichierSortie << endl;
    return 0;
  }
  
  for(const auto& valeur : valeursEquivPour)
    if (tousLesMots.find(valeur.first) == tousLesMots.end())
      outEquivpourExistePasEncore << "# [[" << valeur.first << "]] ([[" << valeur.second << "]])" << endl;
  
  outEquivpourExistePasEncore.close();

  
  // codes langues incohérents dans étyl
  for (const auto& [titre, langues] : etylLanguesMap) {
    outEtylIncoherent << "# [[" << titre << "]] (" << langues.first << "!=" << langues.second << ")" << endl;
  }
  outEtylIncoherent.close();
 
  cout << cmpt << " articles trouves" << endl;
  return 1;
}