import os
import sys
from pathlib import Path

import magic
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

CHARSET = 'UTF-8'
DATA_FOLDER = 'data'
TXT_FOLDER = os.path.join(DATA_FOLDER, "txts")


class Parser:
    """Extract plain text from HTML and PDF files into ``TXT_FOLDER``.

    Each input file ``<stem>.<ext>`` produces ``TXT_FOLDER/<stem>.txt``
    encoded with ``CHARSET``.
    """

    def __init__(self, log_file=None):
        """Store the log destination.

        Args:
            log_file: Writable text stream passed as ``file=`` to ``print``
                by ``parse_pdf``. ``None`` makes ``print`` fall back to
                standard output.
        """
        self.log_file = log_file

    @staticmethod
    def _txt_path(fname):
        """Return the output .txt path for *fname*, creating TXT_FOLDER."""
        os.makedirs(TXT_FOLDER, exist_ok=True)
        return os.path.join(TXT_FOLDER, f"{Path(fname).stem}.txt")

    @staticmethod
    def _largest_text_node(nodes):
        """Find the element with the longest text, skipping script tags.

        Performs a pre-order depth-first traversal starting from *nodes*.
        Elements without a ``children`` attribute (leaves) and ``script``
        elements are neither considered nor descended into. On ties the
        element met first in traversal order wins (strict ``>``).

        Args:
            nodes: Iterable of top-level parse-tree nodes.

        Returns:
            A tuple ``(content, length, depth)`` where ``content`` is
            ``{"text": ..., "tag": ...}`` for the winning element (``None``
            when no element has non-empty text), ``length`` is the length of
            its text and ``depth`` its 1-based depth (both 0 when nothing
            was found).
        """
        best = None
        best_len = 0
        best_depth = 0
        # An explicit stack replaces recursion so that deeply nested markup
        # cannot exhaust the interpreter's recursion limit. Children are
        # pushed in reverse so they are popped in document order.
        stack = [(node, 1) for node in reversed(list(nodes))]
        while stack:
            node, depth = stack.pop()
            if not hasattr(node, "children") or node.name == "script":
                continue
            text = node.text  # computed once per node; it walks the subtree
            if len(text) > best_len:
                best_len = len(text)
                best_depth = depth
                best = {"text": text, "tag": node.name}
            stack.extend(
                (child, depth + 1) for child in reversed(list(node.children))
            )
        return best, best_len, best_depth

    ## To parse HTML files
    def parse_html(self, fname):
        """Write the largest non-script text block of an HTML file to disk.

        The charset is chosen from the ``magic`` description of the file:
        ``iso-8859-1`` when the description contains "iso", ``utf-8``
        otherwise. If the file cannot be read or parsed, the file name, its
        ``magic`` description and the error are printed and the process
        exits with status 2.

        Args:
            fname: Path of the HTML file to parse.

        Returns:
            None. The text is written to ``TXT_FOLDER/<stem>.txt``; an empty
            line is written when the document has no eligible element.
        """
        # Published only for backward compatibility with code that may read
        # these module-level names; the traversal itself uses local state.
        global MAX_CC, MAX_DEPTH, THE_CONTENT

        file_type = magic.from_file(fname)
        charset = "iso-8859-1" if "iso" in file_type.lower() else "utf-8"

        # Read and parse html
        try:
            with open(fname, encoding=charset) as html_file:
                contents = BeautifulSoup(
                    html_file.read(), features="html.parser"
                )
        except Exception as e:
            print(fname)
            print(file_type)
            print(e)
            sys.exit(2)

        best, MAX_CC, MAX_DEPTH = self._largest_text_node(contents.children)
        if best is not None:
            THE_CONTENT = best

        # Use this call's own result, never a stale one from a previous file.
        text = best["text"] if best is not None else ""

        ## Write to file
        with open(self._txt_path(fname), "w", encoding=CHARSET) as txt_file:
            print(text, file=txt_file)

    #####

    ### Parse PDFs
    def parse_pdf(self, fname):
        """Write the text of every page of a PDF to disk and log a word count.

        Each page's text has end-of-line hyphenation ("-\\n") removed and
        remaining newlines replaced by spaces, and is written as one line.
        On success ``fname`` and the number of distinct space-separated
        tokens are printed to ``self.log_file``. Any exception is reported
        to ``self.log_file`` as ``Err <fname>: <error>`` and not re-raised.

        Args:
            fname: Path of the PDF file to parse.

        Returns:
            None.
        """
        try:
            words = set()
            with open(fname, "rb") as pdf_file:
                # Build the reader before creating the output file so an
                # unreadable PDF does not leave an empty .txt behind.
                reader = PdfReader(pdf_file)
                with open(
                    self._txt_path(fname), "w", encoding=CHARSET
                ) as txt_file:
                    for page in reader.pages:
                        page_contents = page.extract_text()
                        page_contents = page_contents.replace("-\n", "")
                        page_contents = page_contents.replace("\n", " ")
                        print(page_contents, file=txt_file)
                        words.update(page_contents.split(" "))
            print(fname, len(words), file=self.log_file)
        except Exception as e:
            # Deliberate best-effort: one bad PDF must not stop a batch run.
            print(f"Err {fname}: {e}", file=self.log_file)

#### END PARSER CODE