Skip to content
Snippets Groups Projects
Commit b9706918 authored by Tiphaine Viard's avatar Tiphaine Viard
Browse files

Updated parser to handle html docs, detect mime types; externalised in parser class

parent c9f548c3
No related branches found
No related tags found
No related merge requests found
import sys
from PyPDF2 import PdfReader
import magic
from bs4 import BeautifulSoup
from pathlib import Path
class Parser:
    """Extract plain text from HTML and PDF files into ``<out_dir>/<stem>.txt``.

    Args:
        log_file: Open text stream that receives per-file statistics and
            error messages. With ``None``, ``print`` falls back to stdout.
        out_dir: Directory the ``.txt`` files are written to. It is not
            created here and must already exist. Defaults to ``"txts"``.
    """

    def __init__(self, log_file=None, out_dir="txts"):
        self.log_file = log_file
        self.out_dir = out_dir

    def _txt_path(self, fname):
        """Return the output path for *fname*: same stem, ``.txt`` suffix."""
        return Path(self.out_dir) / f"{Path(fname).stem}.txt"

    @staticmethod
    def _largest_text_block(nodes):
        """Return the element with the longest text, skipping ``<script>`` subtrees.

        Performs a pre-order, depth-first walk over *nodes* and their
        descendants. Only objects exposing a ``children`` attribute are
        considered; the others are treated as leaves and ignored. On equal
        text length the element met first wins.

        Returns:
            ``{"text": str, "tag": str | None}``; the text is empty and the
            tag is ``None`` when no element carries any text.
        """
        # NOTE(review): an element's text normally includes its descendants'
        # text, so an outer element tends to win -- confirm this is intended.
        best = {"text": "", "tag": None}
        # Explicit stack instead of recursion so that deeply nested documents
        # cannot hit the interpreter's recursion limit. Children are pushed in
        # reverse so that they are popped in document order.
        pending = list(reversed(list(nodes)))
        while pending:
            node = pending.pop()
            if not hasattr(node, "children") or node.name == "script":
                continue
            text = node.text
            if len(text) > len(best["text"]):
                best = {"text": text, "tag": node.name}
            pending.extend(reversed(list(node.children)))
        return best

    ## To parse HTML files
    def parse_html(self, fname):
        """Write the largest text block of the HTML file *fname* to a ``.txt`` file.

        The file is decoded as ISO-8859-1 when the libmagic description
        contains "iso" (case-insensitive), and as UTF-8 otherwise.

        An HTML file that cannot be read or parsed is fatal: the file name,
        its libmagic description and the error are printed to stdout and the
        process exits with status 2.
        """
        description = magic.from_file(fname)
        charset = "iso-8859-1" if "iso" in description.lower() else "utf-8"

        try:
            with open(fname, encoding=charset) as html_file:
                contents = BeautifulSoup(html_file.read(), features="html.parser")
        except Exception as e:
            print(fname)
            print(description)
            print(e)
            sys.exit(2)

        # The result is kept in a local, so nothing leaks from one document
        # to the next.
        best = self._largest_text_block(contents.children)

        ## Write to file
        with open(self._txt_path(fname), "w", encoding="utf-8") as txt_file:
            print(best["text"], file=txt_file)

    #####
    ### Parse PDFs
    def parse_pdf(self, fname):
        """Write the text of every page of the PDF *fname* to a ``.txt`` file.

        Each page is written on its own line. On success the file name and
        the number of distinct space-separated tokens are logged to
        ``log_file``. Any failure is logged there as ``Err <fname>: <error>``
        and the method returns normally.
        """
        try:
            words = set()
            with open(fname, "rb") as pdf_file:
                reader = PdfReader(pdf_file)
                with open(self._txt_path(fname), "w", encoding="utf-8") as txt_file:
                    for page in reader.pages:
                        page_contents = page.extract_text()
                        # Re-join words hyphenated across a line break, then
                        # flatten the remaining line breaks into spaces.
                        page_contents = page_contents.replace("-\n", "")
                        page_contents = page_contents.replace("\n", " ")
                        print(page_contents, file=txt_file)
                        words.update(page_contents.split(" "))
            print(fname, len(words), file=self.log_file)
        except Exception as e:
            # Best effort: one broken PDF must not stop the caller's batch.
            print(f"Err {fname}: {e}", file=self.log_file)
#### END PARSER CODE
from PyPDF2 import PdfReader
import sys
import glob
import os
from pathlib import Path
from tqdm import tqdm
import magic
from Parser import Parser
# Batch driver: convert every file found under ./docs into a text file in
# OUT_FOLDER, dispatching on the MIME type detected by libmagic. Statistics
# and errors go to LOG_FILE.
LOG_FILE = "parse.log"
OUT_FOLDER = "./txts"

# Create output directory if it does not exist
os.makedirs(OUT_FOLDER, exist_ok=True)

all_files = glob.glob("./docs/*")

# The context manager closes the log even if a parser aborts the run.
with open(LOG_FILE, "w") as log_fp:
    p = Parser(log_file=log_fp)

    for fname in tqdm(all_files):
        ftype = magic.from_file(fname, mime=True)

        # Each file is handled exactly once, by the parser matching its type.
        if ftype == "text/html" or ftype == "text/xml":
            # this is a html file
            p.parse_html(fname)
        elif ftype == "application/pdf":
            # this is a pdf file
            p.parse_pdf(fname)
        else:
            print(f"ERR. NOT A RECOGNIZED FILETYPE: {fname}, {ftype}.", file=log_fp)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment