Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mapaie
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Tiphaine Viard
mapaie
Commits
b9706918
Commit
b9706918
authored
1 year ago
by
Tiphaine Viard
Browse files
Options
Downloads
Patches
Plain Diff
Updated parser to handle html docs, detect mime types; externalised in parser class
parent
c9f548c3
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
Parser.py
+90
-0
90 additions, 0 deletions
Parser.py
parse_docs.py
+15
-23
15 additions, 23 deletions
parse_docs.py
with
105 additions
and
23 deletions
Parser.py
0 → 100644
+
90
−
0
View file @
b9706918
import
sys
from
PyPDF2
import
PdfReader
import
magic
from
bs4
import
BeautifulSoup
from
pathlib
import
Path
class Parser:
    """Extract plain text from HTML and PDF documents.

    Each parsed document ``<dir>/<stem>.<ext>`` is written to
    ``<out_dir>/<stem>.txt`` (UTF-8). The output directory must already
    exist; this class does not create it.

    Args:
        log_file: Writable text file object used for per-document log lines.
            It is handed to ``print(..., file=log_file)``, so ``None`` (the
            default) sends log lines to standard output.
        out_dir: Directory receiving the ``.txt`` files. Defaults to
            ``"txts"``, relative to the current working directory.
    """

    def __init__(self, log_file=None, *, out_dir="txts"):
        self.log_file = log_file
        self.out_dir = out_dir

    def _output_path(self, fname):
        """Return the ``.txt`` path that corresponds to the input file *fname*."""
        return Path(self.out_dir) / f"{Path(fname).stem}.txt"

    @staticmethod
    def _iter_elements(children):
        """Yield every non-``script`` element below *children*, depth first.

        Nodes are produced in document (pre-)order: an element is yielded
        before its descendants. Nodes without a ``children`` attribute (text
        nodes) and ``script`` elements, together with their subtrees, are
        skipped. An explicit stack of iterators is used instead of recursion
        so that deeply nested documents cannot exhaust the recursion limit.
        """
        pending = [iter(children)]
        while pending:
            for child in pending[-1]:
                if hasattr(child, "children") and child.name != "script":
                    yield child
                    # Descend now; the parent's iterator stays on the stack
                    # and resumes after this subtree is exhausted.
                    pending.append(iter(child.children))
                    break
            else:
                pending.pop()

    ## To parse HTML files
    def parse_html(self, fname):
        """Write the largest textual block of the HTML file *fname* to disk.

        The document is searched for the element (outside ``script`` tags)
        whose ``.text`` is the longest; the first such element in document
        order wins ties. Its text is written to the output ``.txt`` file.

        If no element carries any text, nothing is written and an ``Err``
        line is sent to the log instead. The search state is local to each
        call, so one document can never leak its content into the next.

        Args:
            fname: Path of the HTML file to parse.

        Raises:
            SystemExit: With status 2 when the file cannot be read or parsed;
                the file name, its libmagic description and the error are
                printed to standard output first.
        """
        # Read as latin-1 when libmagic's description mentions "iso",
        # otherwise as UTF-8.
        description = magic.from_file(fname)
        charset = "iso-8859-1" if "iso" in description.lower() else "utf-8"

        try:
            with open(fname, encoding=charset) as html_fp:
                soup = BeautifulSoup(html_fp.read(), features="html.parser")
        except Exception as e:
            # An unreadable HTML input aborts the whole run.
            print(fname)
            print(description)
            print(e)
            sys.exit(2)

        best_text = None
        best_len = 0
        for element in self._iter_elements(soup.children):
            text = element.text
            # Strict comparison: an ancestor is kept over a descendant that
            # holds exactly the same text.
            if len(text) > best_len:
                best_len = len(text)
                best_text = text

        if best_text is None:
            print(f"Err {fname}: no textual content found", file=self.log_file)
            return

        ## Write to file
        with open(self._output_path(fname), "w", encoding="utf-8") as txt_file:
            print(best_text, file=txt_file)

    #####
    ### Parse PDFs
    def parse_pdf(self, fname):
        """Write the text of every page of the PDF file *fname* to disk.

        For each page, words hyphenated across a line break are re-joined
        (``"-\\n"`` is removed) and the remaining newlines are replaced by
        spaces; each page is then written as one line of the output ``.txt``
        file. On success, the file name and the number of distinct
        space-separated tokens are logged.

        Any error is caught and reported as an ``Err`` line in the log, so a
        single broken PDF does not stop a batch run.

        Args:
            fname: Path of the PDF file to parse.
        """
        try:
            with open(fname, "rb") as pdf_fp:
                # Build the reader before creating the output file so that an
                # invalid PDF does not leave an empty .txt behind.
                reader = PdfReader(pdf_fp)
                words = set()
                with open(self._output_path(fname), "w", encoding="utf-8") as txt_file:
                    for page in reader.pages:
                        # Guard against a page yielding no text at all.
                        page_text = page.extract_text() or ""
                        page_text = page_text.replace("-\n", "").replace("\n", " ")
                        print(page_text, file=txt_file)
                        words.update(page_text.split(" "))
            print(fname, len(words), file=self.log_file)
        except Exception as e:
            print(f"Err {fname}: {e}", file=self.log_file)
#### END PARSER CODE
This diff is collapsed.
Click to expand it.
parse_docs.py
+
15
−
23
View file @
b9706918
from
PyPDF2
import
PdfReader
import
sys
import
glob
import
os
from
pathlib
import
Path
from
tqdm
import
tqdm
import
magic
from
Parser
import
Parser
# Batch driver: converts every document found in ./docs into a text file in
# OUT_FOLDER, dispatching on the MIME type detected by libmagic. Per-document
# results and errors are written to LOG_FILE.
LOG_FILE = "parse.log"
OUT_FOLDER = "./txts"

# Create output directory if it does not exist
os.makedirs(OUT_FOLDER, exist_ok=True)

all_files = glob.glob("./docs/*")

# The context manager guarantees the log is flushed and closed even when a
# parser aborts the run (Parser.parse_html exits on unreadable input).
with open(LOG_FILE, "w") as log_fp:
    p = Parser(log_file=log_fp)

    for fname in tqdm(all_files):
        ftype = magic.from_file(fname, mime=True)

        if ftype in ("text/html", "text/xml"):
            # this is a html file
            p.parse_html(fname)
        elif ftype == "application/pdf":
            # this is a pdf file
            p.parse_pdf(fname)
        else:
            print(f"ERR. NOT A RECOGNIZED FILETYPE: {fname}, {ftype}.", file=log_fp)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment