Compare revisions: tiphaine.viard/mapaie

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (10)
# For the mapaie project
log/
data/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
import os
import sys
from PyPDF2 import PdfReader
import magic
from bs4 import BeautifulSoup
from pathlib import Path

CHARSET = 'UTF-8'
DATA_FOLDER = 'data'
TXT_FOLDER = os.path.join(DATA_FOLDER, "txts")

class Parser:
@@ -60,7 +64,7 @@ class Parser:
        call(all_children, 0)

        ## Write to file
        txt_file = open(os.path.join(TXT_FOLDER, f"{Path(fname).stem}.txt"), "w+", encoding=CHARSET)
        print(THE_CONTENT["text"], file=txt_file)
        txt_file.close()
@@ -72,7 +76,7 @@ class Parser:
        f = open(fname, "rb")
        reader = PdfReader(f)
        words = set()
        txt_file = open(os.path.join(TXT_FOLDER, f"{Path(fname).stem}.txt"), "w+", encoding=CHARSET)
        for page in reader.pages:
            page_contents = page.extract_text()
...
# mapaie

## Getting started

### Reminder: Creating a virtual environment
```sh
pip install virtualenv
virtualenv venv
```

### Reminder: Creating an SSH key for Git
1. Create the SSH key
```sh
ssh-keygen -t ed25519 -C "MS IA <prenom.nom> GITLAB_Telecom-Paris" -f GITLAB_ENST_SSHKey
```
2. Add the public key to your GitLab account (see the snippet below)
3. Test the connection
```sh
ssh -i ..\..\..\GITLAB_ENST_SSHKey -T git@gitlab.enst.fr
```
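For step 2, GitLab needs the *public* half of the key. A minimal sketch, assuming the key was generated with `-f GITLAB_ENST_SSHKey` as above; paste the output into your GitLab profile (Preferences > SSH Keys):
```sh
# Print the public key so it can be copy-pasted into GitLab
cat GITLAB_ENST_SSHKey.pub
```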
### Getting the project environment
1. Create a virtual Python environment
```sh
virtualenv venv
source venv/bin/activate
# on Windows
.\venv\Scripts\activate
```
2. Clone the repository
```sh
git clone <URL>
```
3. Configure Git
```sh
cd mapaie
git config --local user.name "Prénom Nom"
git config --local user.email "prenom.nom@telecom-paris.fr"
git config --local core.sshCommand "ssh -i C:\\<Path_to_SSHKey>\\GITLAB_ENST_SSHKey"
```
- `--local`: configuration for a single Git repository (mapaie/.git/config)
- `--global`: configuration for the current user ($HOME/.config/git/config)
- `--system`: configuration for the whole machine (<Path_to_GIT>/etc/gitconfig)
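You can check which values are active for this repository; a quick sketch, not part of the original setup steps:
```sh
# List only the repository-local configuration
git config --local --list
```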
4. Install all requirements
```sh
python -m pip install -r mapaie\requirements.txt
python -m nltk.downloader stopwords
python -m nltk.downloader punkt_tab
```
5. Install the external dependencies (see the [python-magic](https://pypi.org/project/python-magic/) documentation)
- Windows & Mac
```sh
python -m pip install python-magic-bin
```
- Linux (Debian/Ubuntu)
```sh
sudo apt-get install libmagic1
```
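A quick way to check that libmagic is usable from Python; a sketch only, the file name passed to `magic.from_file` is just an example:
```sh
python -c "import magic; print(magic.from_file('README.md', mime=True))"
```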
## Using

Snakemake should be installed on the side.
You can then run `snakemake -c4` to download PDF files and extract their contents. PDF files are stored in `./pdfs`, and textual contents in `./txts/`.
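Snakemake is installed separately from `requirements.txt` (per the note above); a minimal sketch using the same virtual environment:
```sh
pip install snakemake
snakemake -c4   # run the workflow on 4 cores
```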
### Manually

In order:
```sh
python dl_docs.py
python parse_docs.py
python preproccess.py
python create_corpus_before_lang.py
python create_corpus.py -t themes.json -d data/preprocessed/ -m iramuteq
python create_corpus.py -t themes.json -d data/preprocessed/ -m cortext
```
@@ -8,9 +8,16 @@ from tqdm import tqdm
import ujson as json
import os

DATA_FOLDER = 'data'
LOG_FOLDER = "log/"
LOG_FILENAME = "corpus.log"
LOG_FILE = os.path.join(LOG_FOLDER, LOG_FILENAME)
OUT_FILENAME = "corpus.txt"
CHARSET = 'UTF-8'

os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
log_fp = open(LOG_FILE, "w", encoding=CHARSET)

# Parse arguments
parser = argparse.ArgumentParser()
@@ -22,32 +29,31 @@ parser.add_argument("-t", "--themes")
args = parser.parse_args()
print(args)

# Keywords
keywords = json.load(open(args.themes, encoding=CHARSET))
filt_crit = lambda x, kw_list: all(x)

iramuteq = False
cortext = False
folder_name = os.path.join(DATA_FOLDER, f'corpus_{args.method}')

if args.method == "iramuteq":
    iramuteq = True
elif args.method == "cortext":
    cortext = True
    for t in keywords:
        os.makedirs(os.path.join(folder_name, t), exist_ok=True)
else:
    iramuteq = True
    folder_name = os.path.join(DATA_FOLDER, f'corpus_iramuteq')
    os.makedirs(folder_name, exist_ok=True)

corpus_file = open(os.path.join(folder_name, OUT_FILENAME), "w", encoding=CHARSET)

# Counting docs per theme
nb_docs = 0
doc_counts = { k: 0 for k, v in keywords.items() }
doc_occurrences = {}
@@ -56,7 +62,7 @@ for i, fname in enumerate(tqdm(glob.glob(f"./{args.data}/*.txt"))):
    doc_occurrences[i] = {}
    try:
        f = open(fname, "r", encoding=CHARSET)
        contents = f.read().strip().lower()
        doc_occurrences[i]["contents"] = contents
@@ -67,7 +73,7 @@ for i, fname in enumerate(tqdm(glob.glob(f"./{args.data}/*.txt"))):
        f.close()
    except Exception as e:
        print(f"Err {fname}: {e}", file=log_fp)
        pass

# Write out topics
@@ -104,14 +110,16 @@ for i in doc_occurrences:
    for t in topics:
        # create the topic directory
        if t.strip("*") != "mapaie":
            file = open(os.path.join(folder_name, f"{t.strip('*')}/{i}.txt"), "w", encoding=CHARSET)
            print(doc_occurrences[i]["contents"], file=file)

# also look at co-occurrences of themes
print("Summary stats", file=log_fp)
for k, v in doc_counts.items():
    tmp = '-'
    if nb_docs != 0:
        tmp = v / nb_docs * 100
    print(f"{k}: {v} ({tmp}%)", file=log_fp)

log_fp.close()
corpus_file.close()
# python -m pip install langdetect
import os
import sys

from langdetect import detect
from langdetect import detect_langs

DATA_FOLDER = 'data'
LOG_FOLDER = 'log/'
LOG_FILENAME = 'corpus_lang.log'
LOG_FILE = os.path.join(LOG_FOLDER, LOG_FILENAME)
OUT_FILENAME = 'corpus_lang.csv'
OUT_FILE = os.path.join(DATA_FOLDER, OUT_FILENAME)
CHARSET = 'UTF-8'
DEFAULT_FOLDER_PREPROCESSED = 'data/txts/'

os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)


def normalise_folder(path: str) -> str:
    """Convert the path to UNIX style (/) and add a trailing / if missing.

    :param str path: folder path to normalise
    :return str: the normalised path
    """
    path = path.replace('\\', '/')
    if (path[-1] != '/'):
        path = path + '/'
    # end if
    return path
# end def normalise_folder


data_folder = DEFAULT_FOLDER_PREPROCESSED
if len(sys.argv) > 1:
    data_folder = sys.argv[1]
data_folder = normalise_folder(data_folder)

if not os.path.exists(data_folder):
    print("Folder does not exist!", file=sys.stderr)
    sys.exit(-1)

log_fp = open(LOG_FILE, "w", encoding=CHARSET)
csv_fp = open(OUT_FILE, "w", encoding=CHARSET)

# Header:
print(f"filename;lang;lang_proba", file=csv_fp)

for root, dirs, files in os.walk(data_folder):
    for file_name in files:
        data = None
        lang_detect = '-'
        langs_detect = '-'
        file_path = os.path.join(root, file_name)
        if os.path.getsize(file_path) <= 2:  # empty file
            print(f"{file_name} : Empty file", file=log_fp)
            print(f"'{file_name}';-;-", file=csv_fp)
            continue
        # end if
        with open(file_path, 'r', encoding=CHARSET) as file:
            data = file.read()
            print(file_path, os.path.getsize(file_path), len(data))
            lang_detect = detect(data)
            langs_detect = detect_langs(data)
            print(f"{file_name} : {lang_detect} ({langs_detect})", file=log_fp)
            print(f"'{file_name}';'{lang_detect}';'{langs_detect}'", file=csv_fp)
        # end with
    # end for
# end for

csv_fp.close()
log_fp.close()
@@ -8,16 +8,23 @@ import csv
requests.packages.urllib3.disable_warnings()

URL_FILE = "list_urls.txt"
UA_FILE = "user_agents.txt"
MANIFESTOS_FILE = "all_manifestos.csv"
METADATA_FILENAME = "mapaie-metadata.csv"
DATA_FOLDER = 'data'
LOG_FOLDER = "log/"
LOG_FILENAME = "dl_docs.log"
LOG_FILE = os.path.join(LOG_FOLDER, LOG_FILENAME)
OUT_FOLDER = os.path.join(DATA_FOLDER, "docs")
METADATA_FILE = os.path.join(DATA_FOLDER, METADATA_FILENAME)
CHARSET = 'UTF-8'

os.makedirs(OUT_FOLDER, exist_ok=True)
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)  # create the log folder if it does not exist

def csv_to_dict(filepath):
    manifestos = {}
    with open(filepath, encoding=CHARSET) as f:
        data = csv.reader(f)
        headers = next(data)
        manifestos_list = []
@@ -30,15 +37,13 @@ def csv_to_dict(filepath):
        manifestos_list.append(manifesto)
    return manifestos_list

manifestos_list = csv_to_dict(MANIFESTOS_FILE)
list_of_urls = [ x["URL"] for x in manifestos_list if x["Status"].lower() == "included" ]
user_agents = [ x.strip() for x in open(UA_FILE, encoding=CHARSET).readlines() ]

log_fp = open(LOG_FILE, "w", encoding=CHARSET)
f_metadata = open(METADATA_FILE, "w", encoding=CHARSET)

for i in tqdm(range(len(manifestos_list))):
    manifesto = manifestos_list[i]
@@ -49,22 +54,22 @@ for i in tqdm(range(len(manifestos_list))):
    try:
        headers = { "User-Agent": choice(user_agents), "Referer": "http://perdu.com" }
        response = requests.get(url, headers=headers, timeout=10, verify=False)
    except requests.exceptions.RequestException as e:
        print(f"ERR: {url}, {e}", file=log_fp)

    if response.status_code == 200:
        print(f"{url},OK", file=log_fp)
        if url[-4:] == ".pdf":
            with open(f"{OUT_FOLDER}/{i}.pdf", "wb") as f:
                f.write(response.content)
        else:
            with open(f"{OUT_FOLDER}/{i}.html", "wb") as f:
                f.write(response.content)
        f_metadata.write(f"{i}|{title}|{institution}\n")
    else:
        # if we received any error http code
        print(f"ERR: {url},{response.status_code}", file=log_fp)

log_fp.close()
f_metadata.close()
@@ -5,17 +5,22 @@ import magic
from Parser import Parser

DATA_FOLDER = 'data'
LOG_FOLDER = "log/"
LOG_FILENAME = "parse.log"
LOG_FILE = os.path.join(LOG_FOLDER, LOG_FILENAME)
OUT_FOLDER = os.path.join(DATA_FOLDER, "txts")
DOCS_FOLDER = os.path.join(DATA_FOLDER, "docs")
CHARSET = 'UTF-8'

os.makedirs(OUT_FOLDER, exist_ok=True)
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

log_fp = open(LOG_FILE, "w", encoding=CHARSET)
p = Parser(log_file=log_fp)

all_files = [f for f in glob.glob(f'{DOCS_FOLDER}/*')]

for i in tqdm(range(len(all_files))):
    fname = all_files[i]
...
@@ -9,26 +9,27 @@ from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

DATA_FOLDER = 'data'
OUT_FOLDER = os.path.join(DATA_FOLDER, "preprocessed")
TXT_FOLDER = os.path.join(DATA_FOLDER, "txts")
CHARSET = 'UTF-8'

os.makedirs(OUT_FOLDER, exist_ok=True)

corpus = []
stop_words = set(stopwords.words('english'))

print(f'Preprocessing...')
for i, filename in enumerate(tqdm(glob.glob(f'{TXT_FOLDER}/*.txt'))):
    name = os.path.basename(filename).split('.')[0]
    with open(filename, encoding=CHARSET) as f:
        lines = f.read().strip()

    # Tokenize
    tokens = word_tokenize(lines)

    # Keep tokens of length >= 3 that are not links and not stop words
    tokens = (' ').join([t.lower() for t in tokens
                         if len(t) >= 3
                         and (t.isalpha() or t in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
                         and t.lower() not in stop_words
                         and not "http" in t.lower()
                         ])
@@ -57,11 +58,11 @@ for i, d in enumerate(tqdm(corpus)):
    words = d.split()
    # filt_words = [w for w in words if w in words_to_keep]
    # corpus_filt_tfidf.append(filt_words)
    f = open(f"{OUT_FOLDER}/{i}.txt", "w", encoding=CHARSET)
    f.write(" ".join(words) + "\n")
    f.close()

# with open('corpus_filt_tfidf.txt', 'w', encoding=CHARSET) as f:
#     for d in corpus_filt_tfidf:
#         doc = ' '.join(d)
#         f.write(doc + '\n')
@@ -9,3 +9,4 @@ numpy
scikit-learn
bs4
python-magic
langdetect
import os

CHARSET = 'UTF-8'


def loadData(data_folder: str) -> dict[str, str]:
    """Load the text of each file into a dict.

    :param str data_folder: path of the folder containing the txt files
    :return dict: dict whose keys are file names and whose values are the file contents
    """
    corpus: dict[str, str] = {}
    for root, dirs, files in os.walk(data_folder):
        for file_name in files:
            data = None
            file_path = os.path.join(root, file_name)
            if os.path.getsize(file_path) <= 2:  # empty file
                print(f"{file_name} : Empty file")
                continue
            with open(file_path, 'r', encoding=CHARSET) as file:
                data = file.read()
                if len(data.strip()) == 0:
                    print(f"{file_name} : Empty file")
                    continue
                # end if
                corpus[file_name] = data
            # end with
        # end for
    # end for
    return corpus
# end def loadData
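# Illustrative usage (hypothetical example, not part of the original file):
# corpus = loadData(os.path.join('data', 'preprocessed'))
# print(f"{len(corpus)} documents loaded")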