Skip to content
Snippets Groups Projects
Commit 54203cdd authored by Tiphaine Viard's avatar Tiphaine Viard
Browse files

Added metadata file

parent d560be8d
No related branches found
No related tags found
No related merge requests found
......@@ -3,25 +3,49 @@ import sys
import os
from random import choice
from tqdm import tqdm
import csv
# Silence urllib3 warnings (the download requests further down pass verify=False).
requests.packages.urllib3.disable_warnings()
# Input / output locations used by the script.
URL_FILE = "list_urls.txt"
MANIFESTOS_FILE = "all_manifestos.csv"  # CSV parsed by csv_to_dict
UA_FILE = "user_agents.txt"  # read line by line; one entry is picked at random per request
OUT_FOLDER = "./docs"  # downloaded pages are written here as <index>.html
LOG_FILE = "dl_docs.log"  # failed downloads are reported here
# Log file handle; it stays open for the whole run and is closed at the end of the script.
log_fp = open(LOG_FILE, "w")
# NOTE(review): list_of_urls is reassigned further down from the manifestos CSV,
# so this read of URL_FILE looks redundant — confirm before removing.
list_of_urls = [ x.strip() for x in open(URL_FILE).readlines() ]
def csv_to_dict(filepath):
    """Load a CSV file into a list of per-row dictionaries.

    The first row of the file is taken as the header. Every following row
    becomes a dict keyed by the header names.

    Args:
        filepath: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one dict per data row, in file order. Cells missing
        from a short row are filled with "". Cells beyond the header width
        are ignored. An empty file (no header row) yields an empty list.
    """
    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(filepath, encoding="utf8", newline="") as f:
        data = csv.reader(f)
        headers = next(data, None)
        if headers is None:
            # Completely empty file: nothing to parse.
            return []
        manifestos_list = []
        for d in data:
            # Start from "" for every column so short rows stay well-formed.
            manifesto = dict.fromkeys(headers, "")
            # zip stops at the shorter sequence, so surplus cells are dropped
            # instead of raising IndexError.
            manifesto.update(zip(headers, d))
            manifestos_list.append(manifesto)
    return manifestos_list
# Parse the manifestos CSV into one dict per row.
manifestos_list = csv_to_dict(MANIFESTOS_FILE)
# Keep only the URLs of rows whose "Status" column is "included" (case-insensitive).
list_of_urls = [ x["URL"] for x in manifestos_list if x["Status"].lower() == "included" ]
# Read the user-agent pool through a context manager so the file handle is
# closed right away instead of being left to the garbage collector.
with open(UA_FILE) as ua_fp:
    user_agents = [ line.strip() for line in ua_fp ]
# Create output directory if it does not exist.
# exist_ok=True replaces the separate os.path.exists check, which could race
# with another process creating the directory between the check and makedirs.
os.makedirs(OUT_FOLDER, exist_ok=True)
for i in tqdm(range(len(list_of_urls))):
url = list_of_urls[i]
f_metadata = open("mapaie-metadata.csv", "w", encoding="utf8")
for i in tqdm(range(len(manifestos_list))):
manifesto = manifestos_list[i]
title = manifesto["Name of the document"]
institution = manifesto["Institution"]
url = manifesto["URL"]
try:
headers = { "User-Agent": choice(user_agents), "Referer": "http://perdu.com" }
response = requests.get(url, headers=headers, timeout=10, verify=False)
......@@ -36,9 +60,11 @@ for i in tqdm(range(len(list_of_urls))):
else:
with open(f"{OUT_FOLDER}/{i}.html", "wb") as f:
f.write(response.content)
f_metadata.write(f"{i}|{title}|{institution}\n")
else:
# if we received any error http code
print(f"ERR: {url},{response.status_code}", file=log_fp)
log_fp.close()
f_metadata.close()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment