Skip to content
Snippets Groups Projects
create_corpus.py 708 B
Newer Older
import sys
import glob
from pathlib import Path
from nltk.corpus import stopwords
from tqdm import tqdm

LOG_FILE = "corpus.log"
OUT_FILE = "corpus.txt"
log_fp = open(LOG_FILE, "w")

corpus_file = open(OUT_FILE, "w", encoding="utf-8")


for fname in tqdm(glob.glob(f"./{sys.argv[1]}/*.txt")):
    topics = ["*mapaie"]

        contents = f.read().strip()
        if "fairness" in contents:
            topics.append("*fairness")
        print("**** " + " ".join(topics), file=corpus_file)
        print(contents, file=corpus_file)
        
        f.close()
    except Exception as e:
        print(f"Err {fname}: {e}")
        pass

log_fp.close()
corpus_file.close()