Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tiphaine.viard/mapaie
1 result
Show changes
Commits on Source (2)
...@@ -19,4 +19,6 @@ source venv/bin/activate ...@@ -19,4 +19,6 @@ source venv/bin/activate
pip install -r requirements.txt pip install -r requirements.txt
``` ```
Snakemake must be installed separately.
You can then run `snakemake -c4` to download PDF files and extract their contents. PDF files are stored in `./pdfs/`, and textual contents in `./txts/`.
...@@ -3,25 +3,49 @@ import sys ...@@ -3,25 +3,49 @@ import sys
import os import os
from random import choice from random import choice
from tqdm import tqdm from tqdm import tqdm
import csv
# Silence urllib3's InsecureRequestWarning: requests below are made with
# verify=False.  # NOTE(review): TLS verification is disabled on purpose here,
# but confirm the downloaded documents don't need integrity guarantees.
requests.packages.urllib3.disable_warnings()

URL_FILE = "list_urls.txt"              # legacy flat list of URLs (superseded by the CSV below)
MANIFESTOS_FILE = "all_manifestos.csv"  # CSV of documents; rows carry at least URL and Status columns
UA_FILE = "user_agents.txt"             # one User-Agent string per line, picked at random per request
OUT_FOLDER = "./docs"                   # downloaded documents are written here
LOG_FILE = "dl_docs.log"                # per-URL error log, overwritten on each run ("w" mode)

# Kept open for the whole run; closed explicitly at the end of the script.
log_fp = open(LOG_FILE, "w")
list_of_urls = [ x.strip() for x in open(URL_FILE).readlines() ]
def csv_to_dict(filepath):
    """Load a CSV file into a list of per-row dictionaries.

    Parameters
    ----------
    filepath : str
        Path to a CSV file whose first row holds the column headers.

    Returns
    -------
    list[dict]
        One dict per data row, keyed by header name. Fields absent from a
        short row default to the empty string (``restval=""``), matching the
        original pre-fill behavior; the original also crashed with IndexError
        on rows longer than the header, which DictReader tolerates.
    """
    with open(filepath, encoding="utf8") as f:
        # csv.DictReader pairs headers with row values for us; this also
        # drops the unused `manifestos = {}` local the old code carried.
        return [dict(row) for row in csv.DictReader(f, restval="")]
# Parse the manifesto catalogue, then keep only the URLs of documents whose
# Status column marks them as included in the corpus.
manifestos_list = csv_to_dict(MANIFESTOS_FILE)
list_of_urls = [
    entry["URL"]
    for entry in manifestos_list
    if entry["Status"].lower() == "included"
]
# Pool of User-Agent strings; one is chosen at random per request later on.
# Using a context manager closes the file instead of leaking the handle
# (the old `open(UA_FILE).readlines()` never closed it).
with open(UA_FILE) as ua_fp:
    user_agents = [line.strip() for line in ua_fp]

# Create the output directory if it does not exist; exist_ok avoids the
# check-then-create race of the old `if not os.path.exists(...)` pattern.
os.makedirs(OUT_FOLDER, exist_ok=True)
for i in tqdm(range(len(list_of_urls))): f_metadata = open("mapaie-metadata.csv", "w", encoding="utf8")
url = list_of_urls[i]
for i in tqdm(range(len(manifestos_list))):
manifesto = manifestos_list[i]
title = manifesto["Name of the document"]
institution = manifesto["Institution"]
url = manifesto["URL"]
try: try:
headers = { "User-Agent": choice(user_agents), "Referer": "http://perdu.com" } headers = { "User-Agent": choice(user_agents), "Referer": "http://perdu.com" }
response = requests.get(url, headers=headers, timeout=10, verify=False) response = requests.get(url, headers=headers, timeout=10, verify=False)
...@@ -36,9 +60,11 @@ for i in tqdm(range(len(list_of_urls))): ...@@ -36,9 +60,11 @@ for i in tqdm(range(len(list_of_urls))):
else: else:
with open(f"{OUT_FOLDER}/{i}.html", "wb") as f: with open(f"{OUT_FOLDER}/{i}.html", "wb") as f:
f.write(response.content) f.write(response.content)
f_metadata.write(f"{i}|{title}|{institution}\n")
else: else:
# if we received any error http code # if we received any error http code
print(f"ERR: {url},{response.status_code}", file=log_fp) print(f"ERR: {url},{response.status_code}", file=log_fp)
log_fp.close() log_fp.close()
f_metadata.close()