Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mapaie
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Tiphaine Viard
mapaie
Compare revisions
d560be8d9064d88fa4fdbb05c036c928eb0cb0bd to f3ff7f285fd13b7871ca5b7084d4ff88de24dd33
Compare revisions
Changes are shown as if the
source
revision was being merged into the
target
revision.
Learn more about comparing revisions.
Source
tiphaine.viard/mapaie
Select target project
No results found
f3ff7f285fd13b7871ca5b7084d4ff88de24dd33
Select Git revision
Branches
ia717
main
Swap
Target
tiphaine.viard/mapaie
Select target project
tiphaine.viard/mapaie
1 result
d560be8d9064d88fa4fdbb05c036c928eb0cb0bd
Select Git revision
Branches
ia717
main
Show changes
Only incoming changes from source
Include changes to target since source was created
Compare
Commits on Source (2)
Added metadata file
· 54203cdd
Tiphaine Viard
authored
5 months ago
54203cdd
Updated README
· f3ff7f28
Tiphaine Viard
authored
5 months ago
f3ff7f28
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
README.md
+2
-0
2 additions, 0 deletions
README.md
dl_docs.py
+30
-4
30 additions, 4 deletions
dl_docs.py
with
32 additions
and
4 deletions
README.md
View file @
f3ff7f28
...
...
@@ -19,4 +19,6 @@ source venv/bin/activate
pip install -r requirements.txt
```
Snakemake should be installed on the side.
You can then run
`snakemake -c4`
to download PDF files and extract their contents. PDF files are stored in
`./pdfs`
, and textual contents in
`./txts/`
.
This diff is collapsed.
Click to expand it.
dl_docs.py
View file @
f3ff7f28
...
...
@@ -3,25 +3,49 @@ import sys
import
os
from
random
import
choice
from
tqdm
import
tqdm
import
csv
requests
.
packages
.
urllib3
.
disable_warnings
()
URL_FILE
=
"
list_urls.txt
"
MANIFESTOS_FILE
=
"
all_manifestos.csv
"
UA_FILE
=
"
user_agents.txt
"
OUT_FOLDER
=
"
./docs
"
LOG_FILE
=
"
dl_docs.log
"
log_fp
=
open
(
LOG_FILE
,
"
w
"
)
list_of_urls
=
[
x
.
strip
()
for
x
in
open
(
URL_FILE
).
readlines
()
]
def csv_to_dict(filepath):
    """Read a CSV file and return its rows as a list of dicts keyed by header.

    The first row of the file is taken as the header row. Every following
    row becomes one dict mapping each header to the cell found in the same
    column; headers for which a (short) row has no cell map to "".

    Args:
        filepath: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        A list with one dict per data row, in file order. A file without
        any header row (i.e. an empty file) yields an empty list.

    Raises:
        IndexError: If a data row has more cells than there are headers.
    """
    # newline="" leaves newline handling to the csv module, as its
    # documentation requires (matters for quoted cells containing newlines).
    with open(filepath, encoding="utf8", newline="") as f:
        data = csv.reader(f)

        # An empty file has no header row; report "no rows" instead of
        # letting a bare StopIteration escape from next().
        headers = next(data, None)
        if headers is None:
            return []

        manifestos_list = []
        for row in data:
            # Start from all-empty cells so short rows still expose every key.
            manifesto = dict.fromkeys(headers, "")
            for i, cell in enumerate(row):
                manifesto[headers[i]] = cell
            manifestos_list.append(manifesto)

    return manifestos_list
# Load every row of the manifesto table, then keep the URLs of the rows
# whose "Status" column reads "included" (case-insensitive).
manifestos_list = csv_to_dict(MANIFESTOS_FILE)
list_of_urls = [
    x["URL"] for x in manifestos_list if x["Status"].lower() == "included"
]

# One User-Agent string per line; a random one is picked for each request.
# The context manager closes the file instead of leaking the handle.
with open(UA_FILE) as ua_fp:
    user_agents = [line.strip() for line in ua_fp]

# Create output directory if it does not exist (exist_ok avoids the
# check-then-create race of a separate os.path.exists() test).
os.makedirs(OUT_FOLDER, exist_ok=True)
for
i
in
tqdm
(
range
(
len
(
list_of_urls
))):
url
=
list_of_urls
[
i
]
f_metadata
=
open
(
"
mapaie-metadata.csv
"
,
"
w
"
,
encoding
=
"
utf8
"
)
for
i
in
tqdm
(
range
(
len
(
manifestos_list
))):
manifesto
=
manifestos_list
[
i
]
title
=
manifesto
[
"
Name of the document
"
]
institution
=
manifesto
[
"
Institution
"
]
url
=
manifesto
[
"
URL
"
]
try
:
headers
=
{
"
User-Agent
"
:
choice
(
user_agents
),
"
Referer
"
:
"
http://perdu.com
"
}
response
=
requests
.
get
(
url
,
headers
=
headers
,
timeout
=
10
,
verify
=
False
)
...
...
@@ -36,9 +60,11 @@ for i in tqdm(range(len(list_of_urls))):
else
:
with
open
(
f
"
{
OUT_FOLDER
}
/
{
i
}
.html
"
,
"
wb
"
)
as
f
:
f
.
write
(
response
.
content
)
f_metadata
.
write
(
f
"
{
i
}
|
{
title
}
|
{
institution
}
\n
"
)
else
:
# if we received any error http code
print
(
f
"
ERR:
{
url
}
,
{
response
.
status_code
}
"
,
file
=
log_fp
)
log_fp
.
close
()
f_metadata
.
close
()
This diff is collapsed.
Click to expand it.