import glob
import os

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# The tokenizer and stop word list require the NLTK data packages:
# nltk.download('punkt')
# nltk.download('stopwords')

corpus = []
stop_words = set(stopwords.words('english'))
OUT_FOLDER = "preprocessed/"

# Create the output directory if it does not exist
if not os.path.exists(OUT_FOLDER):
    os.makedirs(OUT_FOLDER)

print('Preprocessing...')

for filename in tqdm(glob.glob('txts/*.txt')):
    name = os.path.splitext(os.path.basename(filename))[0]
    with open(filename) as f:
        text = f.read().strip()
    # Tokenize
    tokens = word_tokenize(text)
    # Keep lowercase alphabetic tokens of length >= 3 that are not stop words
    # and not links (the length check already discards bare punctuation)
    tokens = ' '.join([t.lower() for t in tokens
                       if len(t) >= 3
                       and t.isalpha()
                       and t.lower() not in stop_words
                       and 'http' not in t.lower()])
    # TODO: n-grams could be added here
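    # One possible way to answer the n-gram question above (a sketch, not part
    # of the original pipeline): append '_'-joined bigrams to the document so
    # the vectorizer later treats each bigram as a single term.
    # from nltk import ngrams
    # words = tokens.split()
    # tokens = ' '.join(words + ['_'.join(bg) for bg in ngrams(words, 2)])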
    # Save the preprocessed document to the output folder and keep it
    # in memory for the TF-IDF filtering step
    with open(os.path.join(OUT_FOLDER, name + '.txt'), 'w') as out:
        out.write(tokens + '\n')
    corpus.append(tokens)

# TF-IDF
def tfidf_filter(corpus):
    """Return the words whose mean TF-IDF score across the corpus is above the median."""
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    # Mean TF-IDF of each vocabulary word over all documents
    tfidf_values = np.asarray(X.mean(axis=0)).ravel()
    # Keep only words scoring strictly above the median
    median_tfidf = np.quantile(tfidf_values, 0.5)
    mask = tfidf_values > median_tfidf
    words_to_keep = vectorizer.get_feature_names_out()[mask]
    return words_to_keep
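
# Illustrative only: a toy corpus to sanity-check the filter. Words occurring
# in every document receive a lower IDF weight, so they are more likely to
# fall at or below the median score and be dropped.
# toy = ['red apples and pears', 'green apples and plums', 'dried kiwi snacks']
# print(tfidf_filter(toy))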

# Filter every document down to the words that survive the TF-IDF threshold
words_to_keep = set(tfidf_filter(corpus))

corpus_filt_tfidf = []
for doc in corpus:
    filt_words = [w for w in doc.split() if w in words_to_keep]
    corpus_filt_tfidf.append(filt_words)

with open('corpus_filt_tfidf.txt', 'w') as f:
    for d in corpus_filt_tfidf:
        f.write(' '.join(d) + '\n')
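
# Optional sanity check (not in the original script): compare vocabulary
# sizes before and after the TF-IDF filtering.
# vocab_before = set(w for doc in corpus for w in doc.split())
# print(f'vocabulary: {len(vocab_before)} -> {len(words_to_keep)} words')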