Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tiphaine.viard/mapaie
1 result
Show changes
Commits on Source (2)
...@@ -19,4 +19,6 @@ source venv/bin/activate ...@@ -19,4 +19,6 @@ source venv/bin/activate
pip install -r requirements.txt pip install -r requirements.txt
``` ```
Snakemake must be installed separately.
You can then run `snakemake -c4` to download PDF files and extract their contents. PDF files are stored in `./pdfs/`, and textual contents in `./txts/`.
...@@ -3,25 +3,49 @@ import sys ...@@ -3,25 +3,49 @@ import sys
import os import os
from random import choice from random import choice
from tqdm import tqdm from tqdm import tqdm
import csv
# Silence urllib3's InsecureRequestWarning: requests below are made with
# verify=False.  # NOTE(review): TLS verification is disabled on purpose here,
# but confirm the downloaded documents don't need integrity guarantees.
requests.packages.urllib3.disable_warnings()

URL_FILE = "list_urls.txt"              # legacy flat list of URLs (superseded by the CSV below)
MANIFESTOS_FILE = "all_manifestos.csv"  # CSV of documents; rows carry at least URL and Status columns
UA_FILE = "user_agents.txt"             # one User-Agent string per line, picked at random per request
OUT_FOLDER = "./docs"                   # downloaded documents are written here
LOG_FILE = "dl_docs.log"                # per-URL error log, overwritten on each run ("w" mode)

# Kept open for the whole run; closed explicitly at the end of the script.
log_fp = open(LOG_FILE, "w")
list_of_urls = [ x.strip() for x in open(URL_FILE).readlines() ]
def csv_to_dict(filepath):
    """Load a CSV file into a list of per-row dictionaries.

    Parameters
    ----------
    filepath : str
        Path to a CSV file whose first row holds the column headers.

    Returns
    -------
    list[dict]
        One dict per data row, keyed by header name. Fields absent from a
        short row default to the empty string (``restval=""``), matching the
        original pre-fill behavior; the original also crashed with IndexError
        on rows longer than the header, which DictReader tolerates.
    """
    with open(filepath, encoding="utf8") as f:
        # csv.DictReader pairs headers with row values for us; this also
        # drops the unused `manifestos = {}` local the old code carried.
        return [dict(row) for row in csv.DictReader(f, restval="")]
# Parse the manifesto catalogue, then keep only the URLs of documents whose
# Status column marks them as included in the corpus.
manifestos_list = csv_to_dict(MANIFESTOS_FILE)
list_of_urls = [
    entry["URL"]
    for entry in manifestos_list
    if entry["Status"].lower() == "included"
]
# Pool of User-Agent strings; one is chosen at random per request later on.
# Using a context manager closes the file instead of leaking the handle
# (the old `open(UA_FILE).readlines()` never closed it).
with open(UA_FILE) as ua_fp:
    user_agents = [line.strip() for line in ua_fp]

# Create the output directory if it does not exist; exist_ok avoids the
# check-then-create race of the old `if not os.path.exists(...)` pattern.
os.makedirs(OUT_FOLDER, exist_ok=True)
for i in tqdm(range(len(list_of_urls))): f_metadata = open("mapaie-metadata.csv", "w", encoding="utf8")
url = list_of_urls[i]
for i in tqdm(range(len(manifestos_list))):
manifesto = manifestos_list[i]
title = manifesto["Name of the document"]
institution = manifesto["Institution"]
url = manifesto["URL"]
try: try:
headers = { "User-Agent": choice(user_agents), "Referer": "http://perdu.com" } headers = { "User-Agent": choice(user_agents), "Referer": "http://perdu.com" }
response = requests.get(url, headers=headers, timeout=10, verify=False) response = requests.get(url, headers=headers, timeout=10, verify=False)
...@@ -36,9 +60,11 @@ for i in tqdm(range(len(list_of_urls))): ...@@ -36,9 +60,11 @@ for i in tqdm(range(len(list_of_urls))):
else: else:
with open(f"{OUT_FOLDER}/{i}.html", "wb") as f: with open(f"{OUT_FOLDER}/{i}.html", "wb") as f:
f.write(response.content) f.write(response.content)
f_metadata.write(f"{i}|{title}|{institution}\n")
else: else:
# if we received any error http code # if we received any error http code
print(f"ERR: {url},{response.status_code}", file=log_fp) print(f"ERR: {url},{response.status_code}", file=log_fp)
log_fp.close() log_fp.close()
f_metadata.close()