Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: tiphaine.viard/mapaie
Commits on Source (2)
@@ -19,4 +19,6 @@ source venv/bin/activate
 pip install -r requirements.txt
 ```
+Snakemake should be installed separately.
+You can then run `snakemake -c4` to download the PDF files and extract their contents. PDF files are stored in `./pdfs`, and their textual contents in `./txts/`.
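The Snakefile that `snakemake -c4` runs is not part of this diff. Purely as an illustration, a download rule might look roughly like the sketch below; the rule name, script name, and output path are assumptions, not the repository's actual workflow.

```
# Hypothetical sketch only -- not the repository's real Snakefile.
# Assumes the download script is named dl_docs.py (the script below logs
# to dl_docs.log) and that it fills the ./docs directory.
rule download_docs:
    output:
        directory("docs")
    shell:
        "python dl_docs.py"
```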
@@ -3,25 +3,49 @@ import sys
 import os
 from random import choice
 from tqdm import tqdm
+import csv

 requests.packages.urllib3.disable_warnings()

-URL_FILE = "list_urls.txt"
+MANIFESTOS_FILE = "all_manifestos.csv"
 UA_FILE = "user_agents.txt"
 OUT_FOLDER = "./docs"
 LOG_FILE = "dl_docs.log"

 log_fp = open(LOG_FILE, "w")

-list_of_urls = [ x.strip() for x in open(URL_FILE).readlines() ]
+def csv_to_dict(filepath):
+    manifestos = {}
+    with open(filepath, encoding="utf8") as f:
+        data = csv.reader(f)
+        headers = next(data)
+        manifestos_list = []
+        for d in data:
+            manifesto = { h: "" for h in headers }
+            for i,x in enumerate(d):
+                head = headers[i]
+                manifesto[head] = x
+            manifestos_list.append(manifesto)
+    return manifestos_list
+
+manifestos_list = csv_to_dict(MANIFESTOS_FILE)
+list_of_urls = [ x["URL"] for x in manifestos_list if x["Status"].lower() == "included" ]

 user_agents = [ x.strip() for x in open(UA_FILE).readlines() ]

 # Create output directory if it does not exist
 if not os.path.exists(OUT_FOLDER):
     os.makedirs(OUT_FOLDER)

-for i in tqdm(range(len(list_of_urls))):
-    url = list_of_urls[i]
+f_metadata = open("mapaie-metadata.csv", "w", encoding="utf8")
+
+for i in tqdm(range(len(manifestos_list))):
+    manifesto = manifestos_list[i]
+    title = manifesto["Name of the document"]
+    institution = manifesto["Institution"]
+    url = manifesto["URL"]
     try:
         headers = { "User-Agent": choice(user_agents), "Referer": "http://perdu.com" }
         response = requests.get(url, headers=headers, timeout=10, verify=False)
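The `csv_to_dict` helper added above is essentially a hand-rolled `csv.DictReader`: it takes field names from the header row and builds one dict per data row. For comparison only (this is not part of the commit), the standard library gives the same result directly:

```
import csv

def csv_to_dict(filepath):
    # DictReader uses the first row as field names and yields one dict per row;
    # restval="" matches the helper above, which pre-fills missing columns with "".
    with open(filepath, encoding="utf8") as f:
        return list(csv.DictReader(f, restval=""))
```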
@@ -36,9 +60,11 @@ for i in tqdm(range(len(list_of_urls))):
         else:
             with open(f"{OUT_FOLDER}/{i}.html", "wb") as f:
                 f.write(response.content)
+        f_metadata.write(f"{i}|{title}|{institution}\n")
     else:
         # if we received any error http code
         print(f"ERR: {url},{response.status_code}", file=log_fp)


 log_fp.close()
+f_metadata.close()
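The metadata file written by the loop above is pipe-delimited, one line per downloaded document (`index|title|institution`). A small sketch of reading it back, again not part of the commit and assuming that exact layout:

```
import csv

# Read mapaie-metadata.csv back into (doc_id, title, institution) rows.
# Note: a title containing "|" would break this simple delimiter choice.
with open("mapaie-metadata.csv", encoding="utf8") as f:
    for doc_id, title, institution in csv.reader(f, delimiter="|"):
        print(doc_id, title, institution)
```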