Newer
Older
import sys
import glob
from pathlib import Path
from nltk.corpus import stopwords
LOG_FILE = "corpus.log"  # per-file read failures are recorded here
OUT_FILE = "corpus.txt"  # concatenated, topic-tagged corpus

# tqdm is used only as a progress bar. The script called it without importing
# it (NameError on the first iteration); import it here and degrade to a plain
# iterator when the package is not installed.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        """Fallback for a missing tqdm: iterate without a progress bar."""
        return iterable


def _topics_for(contents):
    """Return the header tags for one document.

    Every document is tagged ``*mapaie``; ``*fairness`` is added when the
    text contains the substring "fairness" (case-sensitive).
    """
    topics = ["*mapaie"]
    if "fairness" in contents:
        topics.append("*fairness")
    return topics


def build_corpus(input_dir, out_path=OUT_FILE, log_path=LOG_FILE):
    """Concatenate every ``*.txt`` file of *input_dir* into one corpus file.

    Each document is written as a header line ``**** <tags>`` followed by the
    file's stripped text. Files that cannot be read or decoded are skipped;
    the failure is echoed to stdout and recorded in the log file.

    Args:
        input_dir: Directory holding the ``.txt`` documents (relative or
            absolute).
        out_path: Destination corpus file, written as UTF-8.
        log_path: Destination of the error log, written as UTF-8.

    Returns:
        A ``(written, failed)`` tuple of document counts.
    """
    pattern = str(Path(input_dir) / "*.txt")
    # Sorted so the corpus order is reproducible across runs and platforms.
    file_names = sorted(glob.glob(pattern))
    written = 0
    failed = 0

    # Context managers guarantee both outputs are flushed and closed even if
    # the loop is interrupted.
    with open(log_path, "w", encoding="utf-8") as log_fp, \
            open(out_path, "w", encoding="utf-8") as corpus_file:
        for fname in tqdm(file_names):
            # Only the read is best-effort; a failure writing the corpus is a
            # real error and is allowed to propagate.
            try:
                # Read as UTF-8 to match the output encoding instead of
                # depending on the platform locale.
                with open(fname, "r", encoding="utf-8") as f:
                    contents = f.read().strip()
            except (OSError, UnicodeError) as e:
                message = f"Err {fname}: {e}"
                print(message)
                print(message, file=log_fp)
                failed += 1
                continue

            print("**** " + " ".join(_topics_for(contents)), file=corpus_file)
            print(contents, file=corpus_file)
            written += 1

    return written, failed


def main(argv=None):
    """Command-line entry point: ``script.py <input_dir>``.

    Exits with a usage message (non-zero status) when the directory argument
    is missing; any extra arguments are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit(f"usage: {Path(sys.argv[0]).name} <input_dir>")
    build_corpus(args[0])


if __name__ == "__main__":
    main()