Skip to content
Snippets Groups Projects
Commit fee72e27 authored by Tiphaine Viard
Browse files

Create subcorpora per theme

parent 03d59795
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,7 @@ import sys
import glob
from pathlib import Path
from nltk.corpus import stopwords
from tqdm import tqdm
LOG_FILE = "corpus.log"
OUT_FILE = "corpus.txt"
......@@ -9,13 +10,18 @@ log_fp = open(LOG_FILE, "w")
corpus_file = open(OUT_FILE, "w", encoding="utf-8")
for fname in glob.glob("./txts/*.txt"):
print(fname)
for fname in tqdm(glob.glob(f"./{sys.argv[1]}/*.txt")):
topics = ["*mapaie"]
try:
f = open(fname, "r")
contents = f.read().strip()
if "fairness" in contents:
topics.append("*fairness")
print("**** *mapaie", file=corpus_file)
print(f.read().strip(), file=corpus_file)
print("**** " + " ".join(topics), file=corpus_file)
print(contents, file=corpus_file)
f.close()
except Exception as e:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment