Compare revisions: tiphaine.viard/mapaie

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (10)
# For the mapaie project
log/
data/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
import os
import sys
from PyPDF2 import PdfReader
import magic
from bs4 import BeautifulSoup
from pathlib import Path

CHARSET = 'UTF-8'
DATA_FOLDER = 'data'
TXT_FOLDER = os.path.join(DATA_FOLDER, "txts")

class Parser:
@@ -60,7 +64,7 @@ class Parser:
        call(all_children, 0)

        ## Write to file
        txt_file = open(os.path.join(TXT_FOLDER, f"{Path(fname).stem}.txt"), "w+", encoding=CHARSET)
        print(THE_CONTENT["text"], file=txt_file)
        txt_file.close()
@@ -72,7 +76,7 @@ class Parser:
        f = open(fname, "rb")
        reader = PdfReader(f)
        words = set()
        txt_file = open(os.path.join(TXT_FOLDER, f"{Path(fname).stem}.txt"), "w+", encoding=CHARSET)
        for page in reader.pages:
            page_contents = page.extract_text()
...
# mapaie

## Getting started

### Reminder: Creating a virtual environment
```sh
pip install virtualenv
virtualenv venv
```

### Reminder: Creating an SSH key for Git
1. Create the SSH key
```sh
ssh-keygen -t ed25519 -C "MS IA <prenom.nom> GITLAB_Telecom-Paris" -f GITLAB_ENST_SSHKey
```
2. Add the public key to your GitLab account (see the snippet below)
3. Test the connection
```sh
ssh -i ..\..\..\GITLAB_ENST_SSHKey -T git@gitlab.enst.fr
```
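For step 2, GitLab needs the *public* half of the key. A minimal sketch, assuming the key was generated with `-f GITLAB_ENST_SSHKey` as above; paste the output into your GitLab profile (Preferences > SSH Keys):
```sh
# Print the public key so it can be copy-pasted into GitLab
cat GITLAB_ENST_SSHKey.pub
```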
### Getting the project environment
1. Create a virtual Python environment
```sh
virtualenv venv
source venv/bin/activate
# on Windows
.\venv\Scripts\activate
```
2. Clone the repository
```sh
git clone <URL>
```
3. Configure Git
```sh
cd mapaie
git config --local user.name "Prénom Nom"
git config --local user.email "prenom.nom@telecom-paris.fr"
git config --local core.sshCommand "ssh -i C:\\<Path_to_SSHKey>\\GITLAB_ENST_SSHKey"
```
- `--local`: configuration for a single Git repository (mapaie/.git/config)
- `--global`: configuration for the current user ($HOME/.config/git/config)
- `--system`: configuration for the whole machine (<Path_to_GIT>/etc/gitconfig)
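You can check which values are active for this repository; a quick sketch, not part of the original setup steps:
```sh
# List only the repository-local configuration
git config --local --list
```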
4. Install all requirements
```sh
python -m pip install -r mapaie\requirements.txt
python -m nltk.downloader stopwords
python -m nltk.downloader punkt_tab
```
5. Install the external dependencies (see the [python-magic](https://pypi.org/project/python-magic/) documentation)
- Windows & Mac
```sh
python -m pip install python-magic-bin
```
- Linux (Debian/Ubuntu)
```sh
sudo apt-get install libmagic1
```
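A quick way to check that libmagic is usable from Python; a sketch only, the file name passed to `magic.from_file` is just an example:
```sh
python -c "import magic; print(magic.from_file('README.md', mime=True))"
```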
## Using

Snakemake should be installed on the side.
You can then run `snakemake -c4` to download PDF files and extract their contents. PDF files are stored in `./pdfs`, and textual contents in `./txts/`.
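Snakemake is installed separately from `requirements.txt` (per the note above); a minimal sketch using the same virtual environment:
```sh
pip install snakemake
snakemake -c4   # run the workflow on 4 cores
```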
### Manually

In order:
```sh
python dl_docs.py
python parse_docs.py
python preproccess.py
python create_corpus_before_lang.py
python create_corpus.py -t themes.json -d data/preprocessed/ -m iramuteq
python create_corpus.py -t themes.json -d data/preprocessed/ -m cortext
```
@@ -8,9 +8,16 @@ from tqdm import tqdm
import ujson as json
import os

DATA_FOLDER = 'data'
LOG_FOLDER = "log/"
LOG_FILENAME = "corpus.log"
LOG_FILE = os.path.join(LOG_FOLDER, LOG_FILENAME)
OUT_FILENAME = "corpus.txt"
CHARSET = 'UTF-8'

os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
log_fp = open(LOG_FILE, "w", encoding=CHARSET)

# Parse arguments
parser = argparse.ArgumentParser()
@@ -22,32 +29,31 @@ parser.add_argument("-t", "--themes")
args = parser.parse_args()
print(args)

# Keywords
keywords = json.load(open(args.themes, encoding=CHARSET))
filt_crit = lambda x, kw_list: all(x)

iramuteq = False
cortext = False
folder_name = os.path.join(DATA_FOLDER, f'corpus_{args.method}')

if args.method == "iramuteq":
    iramuteq = True
elif args.method == "cortext":
    cortext = True
    for t in keywords:
        os.makedirs(os.path.join(folder_name, t), exist_ok=True)
else:
    iramuteq = True
    folder_name = os.path.join(DATA_FOLDER, f'corpus_iramuteq')
    os.makedirs(folder_name, exist_ok=True)

corpus_file = open(os.path.join(folder_name, OUT_FILENAME), "w", encoding=CHARSET)

# Counting docs per theme
nb_docs = 0
doc_counts = { k: 0 for k, v in keywords.items() }
doc_occurrences = {}
@@ -56,7 +62,7 @@ for i, fname in enumerate(tqdm(glob.glob(f"./{args.data}/*.txt"))):
    doc_occurrences[i] = {}
    try:
        f = open(fname, "r", encoding=CHARSET)
        contents = f.read().strip().lower()
        doc_occurrences[i]["contents"] = contents
@@ -67,7 +73,7 @@ for i, fname in enumerate(tqdm(glob.glob(f"./{args.data}/*.txt"))):
        f.close()
    except Exception as e:
        print(f"Err {fname}: {e}", file=log_fp)
        pass

# Write out topics
@@ -104,14 +110,16 @@ for i in doc_occurrences:
    for t in topics:
        # create the topic directory
        if t.strip("*") != "mapaie":
            file = open(os.path.join(folder_name, f"{t.strip('*')}/{i}.txt"), "w", encoding=CHARSET)
            print(doc_occurrences[i]["contents"], file=file)

# also look at co-occurrences of themes
print("Summary stats", file=log_fp)
for k, v in doc_counts.items():
    tmp = '-'
    if nb_docs != 0:
        tmp = v / nb_docs * 100
    print(f"{k}: {v} ({tmp}%)", file=log_fp)

log_fp.close()
corpus_file.close()
# python -m pip install langdetect
import os
import sys

from langdetect import detect
from langdetect import detect_langs

DATA_FOLDER = 'data'
LOG_FOLDER = 'log/'
LOG_FILENAME = 'corpus_lang.log'
LOG_FILE = os.path.join(LOG_FOLDER, LOG_FILENAME)
OUT_FILENAME = 'corpus_lang.csv'
OUT_FILE = os.path.join(DATA_FOLDER, OUT_FILENAME)
CHARSET = 'UTF-8'
DEFAULT_FOLDER_PREPROCESSED = 'data/txts/'

os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)


def normalise_folder(path: str) -> str:
    """Convert the path to UNIX style (/) and add a trailing / if missing.

    :param str path: folder path to normalise
    :return str: the normalised path
    """
    path = path.replace('\\', '/')
    if (path[-1] != '/'):
        path = path + '/'
    # end if
    return path
# end def normalise_folder


data_folder = DEFAULT_FOLDER_PREPROCESSED
if len(sys.argv) > 1:
    data_folder = sys.argv[1]
data_folder = normalise_folder(data_folder)

if not os.path.exists(data_folder):
    print("Folder does not exist!", file=sys.stderr)
    sys.exit(-1)

log_fp = open(LOG_FILE, "w", encoding=CHARSET)
csv_fp = open(OUT_FILE, "w", encoding=CHARSET)

# Header:
print(f"filename;lang;lang_proba", file=csv_fp)

for root, dirs, files in os.walk(data_folder):
    for file_name in files:
        data = None
        lang_detect = '-'
        langs_detect = '-'
        file_path = os.path.join(root, file_name)
        if os.path.getsize(file_path) <= 2:  # empty file
            print(f"{file_name} : Empty file", file=log_fp)
            print(f"'{file_name}';-;-", file=csv_fp)
            continue
        # end if
        with open(file_path, 'r', encoding=CHARSET) as file:
            data = file.read()
            print(file_path, os.path.getsize(file_path), len(data))
            lang_detect = detect(data)
            langs_detect = detect_langs(data)
            print(f"{file_name} : {lang_detect} ({langs_detect})", file=log_fp)
            print(f"'{file_name}';'{lang_detect}';'{langs_detect}'", file=csv_fp)
        # end with
    # end for
# end for

csv_fp.close()
log_fp.close()
@@ -8,16 +8,23 @@ import csv
requests.packages.urllib3.disable_warnings()

URL_FILE = "list_urls.txt"
UA_FILE = "user_agents.txt"
MANIFESTOS_FILE = "all_manifestos.csv"
METADATA_FILENAME = "mapaie-metadata.csv"
DATA_FOLDER = 'data'
LOG_FOLDER = "log/"
LOG_FILENAME = "dl_docs.log"
LOG_FILE = os.path.join(LOG_FOLDER, LOG_FILENAME)
OUT_FOLDER = os.path.join(DATA_FOLDER, "docs")
METADATA_FILE = os.path.join(DATA_FOLDER, METADATA_FILENAME)
CHARSET = 'UTF-8'

os.makedirs(OUT_FOLDER, exist_ok=True)
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)  # create the log folder if it does not exist

def csv_to_dict(filepath):
    manifestos = {}
    with open(filepath, encoding=CHARSET) as f:
        data = csv.reader(f)
        headers = next(data)
        manifestos_list = []
@@ -30,15 +37,13 @@ def csv_to_dict(filepath):
        manifestos_list.append(manifesto)
    return manifestos_list

manifestos_list = csv_to_dict(MANIFESTOS_FILE)
list_of_urls = [ x["URL"] for x in manifestos_list if x["Status"].lower() == "included" ]
user_agents = [ x.strip() for x in open(UA_FILE, encoding=CHARSET).readlines() ]

log_fp = open(LOG_FILE, "w", encoding=CHARSET)
f_metadata = open(METADATA_FILE, "w", encoding=CHARSET)

for i in tqdm(range(len(manifestos_list))):
    manifesto = manifestos_list[i]
@@ -49,22 +54,22 @@ for i in tqdm(range(len(manifestos_list))):
    try:
        headers = { "User-Agent": choice(user_agents), "Referer": "http://perdu.com" }
        response = requests.get(url, headers=headers, timeout=10, verify=False)
    except requests.exceptions.RequestException as e:
        print(f"ERR: {url}, {e}", file=log_fp)

    if response.status_code == 200:
        print(f"{url},OK", file=log_fp)
        if url[-4:] == ".pdf":
            with open(f"{OUT_FOLDER}/{i}.pdf", "wb") as f:
                f.write(response.content)
        else:
            with open(f"{OUT_FOLDER}/{i}.html", "wb") as f:
                f.write(response.content)
        f_metadata.write(f"{i}|{title}|{institution}\n")
    else:
        # if we received any error http code
        print(f"ERR: {url},{response.status_code}", file=log_fp)

log_fp.close()
f_metadata.close()
@@ -5,17 +5,22 @@ import magic
from Parser import Parser

DATA_FOLDER = 'data'
LOG_FOLDER = "log/"
LOG_FILENAME = "parse.log"
LOG_FILE = os.path.join(LOG_FOLDER, LOG_FILENAME)
OUT_FOLDER = os.path.join(DATA_FOLDER, "txts")
DOCS_FOLDER = os.path.join(DATA_FOLDER, "docs")
CHARSET = 'UTF-8'

os.makedirs(OUT_FOLDER, exist_ok=True)
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

log_fp = open(LOG_FILE, "w", encoding=CHARSET)
p = Parser(log_file=log_fp)

all_files = [f for f in glob.glob(f'{DOCS_FOLDER}/*')]

for i in tqdm(range(len(all_files))):
    fname = all_files[i]
...
@@ -9,26 +9,27 @@ from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

DATA_FOLDER = 'data'
OUT_FOLDER = os.path.join(DATA_FOLDER, "preprocessed")
TXT_FOLDER = os.path.join(DATA_FOLDER, "txts")
CHARSET = 'UTF-8'

os.makedirs(OUT_FOLDER, exist_ok=True)

corpus = []
stop_words = set(stopwords.words('english'))

print(f'Preprocessing...')
for i, filename in enumerate(tqdm(glob.glob(f'{TXT_FOLDER}/*.txt'))):
    name = os.path.basename(filename).split('.')[0]
    with open(filename, encoding=CHARSET) as f:
        lines = f.read().strip()

    # Tokenize
    tokens = word_tokenize(lines)

    # Keep tokens of length >= 3 that are not links and not stop words
    tokens = (' ').join([t.lower() for t in tokens
                         if len(t) >= 3
                         and (t.isalpha() or t in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
                         and t.lower() not in stop_words
                         and not "http" in t.lower()
                         ])
@@ -57,11 +58,11 @@ for i, d in enumerate(tqdm(corpus)):
    words = d.split()
    # filt_words = [w for w in words if w in words_to_keep]
    # corpus_filt_tfidf.append(filt_words)
    f = open(f"{OUT_FOLDER}/{i}.txt", "w", encoding=CHARSET)
    f.write(" ".join(words) + "\n")
    f.close()

# with open('corpus_filt_tfidf.txt', 'w', encoding=CHARSET) as f:
#     for d in corpus_filt_tfidf:
#         doc = ' '.join(d)
#         f.write(doc + '\n')
@@ -9,3 +9,4 @@ numpy
scikit-learn
bs4
python-magic
langdetect
import os

CHARSET = 'UTF-8'


def loadData(data_folder: str) -> dict[str, str]:
    """Load the text of each file into a dict.

    :param str data_folder: path of the folder containing the txt files
    :return dict: dict whose keys are file names and whose values are the file contents
    """
    corpus: dict[str, str] = {}
    for root, dirs, files in os.walk(data_folder):
        for file_name in files:
            data = None
            file_path = os.path.join(root, file_name)
            if os.path.getsize(file_path) <= 2:  # empty file
                print(f"{file_name} : Empty file")
                continue
            with open(file_path, 'r', encoding=CHARSET) as file:
                data = file.read()
                if len(data.strip()) == 0:
                    print(f"{file_name} : Empty file")
                    continue
                # end if
                corpus[file_name] = data
            # end with
        # end for
    # end for
    return corpus
# end def loadData
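# Illustrative usage (hypothetical example, not part of the original file):
# corpus = loadData(os.path.join('data', 'preprocessed'))
# print(f"{len(corpus)} documents loaded")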