# Parser.py
# Authored by Nicolas Allègre
import os
import sys
from PyPDF2 import PdfReader
import magic
from bs4 import BeautifulSoup
from pathlib import Path
# Folder layout: extracted text files are written to <DATA_FOLDER>/txts.
DATA_FOLDER = "data"
TXT_FOLDER = os.path.join(DATA_FOLDER, "txts")

# Encoding used when writing the extracted .txt files.
CHARSET = "UTF-8"
class Parser:
    """Extract plain text from HTML and PDF files.

    Each ``parse_*`` method writes its result to
    ``<TXT_FOLDER>/<stem of the input file>.txt`` using ``CHARSET``.
    """

    def __init__(self, log_file=None):
        """Store the stream used for log lines.

        Args:
            log_file: Writable text stream passed to ``print(..., file=...)``
                by ``parse_pdf``. ``None`` makes ``print`` fall back to
                standard output.
        """
        self.log_file = log_file

    @staticmethod
    def _txt_path(fname):
        """Return the output ``.txt`` path for *fname*, creating TXT_FOLDER."""
        # Without this, the first write fails when the output folder is absent.
        os.makedirs(TXT_FOLDER, exist_ok=True)
        return os.path.join(TXT_FOLDER, f"{Path(fname).stem}.txt")

    @staticmethod
    def _largest_text(nodes):
        """Return the longest ``.text`` found among *nodes* and descendants.

        Performs a pre-order, depth-first walk. Only nodes exposing a
        ``children`` attribute are considered, and nodes named ``"script"``
        are skipped together with their subtree. On ties the node visited
        first wins (strict ``>`` comparison).

        The walk is iterative (explicit stack of iterators) so that deeply
        nested markup cannot exhaust Python's recursion limit.

        Args:
            nodes: Iterable of DOM-like nodes.

        Returns:
            The longest text found, or ``None`` when no eligible node has
            non-empty text.
        """
        best_text = None
        best_len = 0
        stack = [iter(nodes)]
        while stack:
            for node in stack[-1]:
                # Nodes without ``children`` are leaves; script subtrees
                # are ignored entirely.
                if not hasattr(node, "children") or node.name == "script":
                    continue
                text = node.text
                if len(text) > best_len:
                    best_len = len(text)
                    best_text = text
                # Descend now; the parent iterator resumes once this
                # subtree is exhausted, which preserves pre-order.
                stack.append(iter(node.children))
                break
            else:
                stack.pop()
        return best_text

    ## To parse HTML files
    def parse_html(self, fname):
        """Write the largest non-script text block of an HTML file to disk.

        The input encoding is ``iso-8859-1`` when the description returned by
        ``magic.from_file`` contains "iso" (case-insensitive), else ``utf-8``.
        If no element with text is found, an empty line is written.

        Args:
            fname: Path of the HTML file to parse.

        Raises:
            SystemExit: With status 2 when the file cannot be read or parsed;
                the file name, its ``magic`` description and the error are
                printed first.
        """
        if "iso" in magic.from_file(fname).lower():
            charset = "iso-8859-1"
        else:
            charset = "utf-8"

        try:
            with open(fname, encoding=charset) as html_file:
                contents = BeautifulSoup(html_file.read(), features="html.parser")
        except Exception as e:
            print(fname)
            print(magic.from_file(fname))
            print(e)
            sys.exit(2)

        # State is local to this call, so a document without any eligible
        # element can no longer reuse the text of a previously parsed file.
        text = self._largest_text(contents.children)
        if text is None:
            text = ""

        ## Write to file
        with open(self._txt_path(fname), "w", encoding=CHARSET) as txt_file:
            print(text, file=txt_file)

    #####
    ### Parse PDFs
    def parse_pdf(self, fname):
        """Write the text of every page of a PDF to disk and log a word count.

        For each page, hyphenated line breaks (``"-\\n"``) are joined and the
        remaining newlines become spaces; one output line is written per
        page. On success ``"<fname> <n>"`` is logged, where ``n`` is the
        number of distinct space-separated tokens. Any failure is logged as
        ``"Err <fname>: <error>"`` instead of being raised (best effort).

        Args:
            fname: Path of the PDF file to parse.
        """
        try:
            with open(fname, "rb") as pdf_file:
                reader = PdfReader(pdf_file)
                words = set()
                with open(self._txt_path(fname), "w", encoding=CHARSET) as txt_file:
                    for page in reader.pages:
                        # Guard against a page yielding no text at all.
                        page_contents = page.extract_text() or ""
                        page_contents = page_contents.replace("-\n", "")
                        page_contents = page_contents.replace("\n", " ")
                        print(page_contents, file=txt_file)
                        words.update(page_contents.split(" "))
            print(fname, len(words), file=self.log_file)
        except Exception as e:
            # Deliberately broad: one broken PDF must not stop a batch run.
            print(f"Err {fname}: {e}", file=self.log_file)
#### END PARSER CODE