# Runs create_corpus_before_lang.py. The rule declares the data/txts/
# directory as its input and data/corpus_lang.csv as the file it produces;
# the script itself is invoked without arguments.
rule lang:
    input: "data/txts/"
    output: "data/corpus_lang.csv"
    shell: "python create_corpus_before_lang.py"

# Runs create_corpus.py in "iramuteq" mode (-m) over data/preprocessed/ (-d)
# with themes.json (-t); the declared result is the data/corpus_iramuteq/
# directory.
#
# themes.json is listed as an input because the command reads it: Snakemake
# then reruns this rule when the theme file is newer than the output, and
# reports a missing-input error up front if the file is absent.
rule corpus_iramuteq:
    input:
        "data/preprocessed/",
        "themes.json"
    output:
        directory("data/corpus_iramuteq/")
    shell:
        "python create_corpus.py -t themes.json -d data/preprocessed/ -m iramuteq"

# Runs create_corpus.py in "cortext" mode (-m) over data/preprocessed/ (-d)
# with themes.json (-t); the declared result is the data/corpus_cortex/
# directory.
#
# themes.json is listed as an input because the command reads it: Snakemake
# then reruns this rule when the theme file is newer than the output, and
# reports a missing-input error up front if the file is absent.
#
# NOTE(review): the rule name and output directory are spelled "cortex" while
# the mode flag is "cortext" -- confirm that create_corpus.py really writes to
# data/corpus_cortex/ in this mode.
rule corpus_cortex:
    input:
        "data/preprocessed/",
        "themes.json"
    output:
        directory("data/corpus_cortex/")
    shell:
        "python create_corpus.py -t themes.json -d data/preprocessed/ -m cortext"

# Runs preprocess.py. The rule declares the data/txts/ directory as its input
# and the data/preprocessed/ directory as its output; the script is invoked
# without arguments.
rule preprocess:
    input: "data/txts/"
    output: directory("data/preprocessed/")
    shell: "python preprocess.py"

# Runs parse_docs.py. The rule declares the data/docs/ directory as its input
# and the data/txts/ directory as its output; the script is invoked without
# arguments.
rule parse:
    input: "data/docs/"
    output: directory("data/txts/")
    shell: "python parse_docs.py"

# Runs dl_docs.py. The rule has no declared inputs and declares the
# data/docs/ directory as its output.
rule download:
    output: directory("data/docs/")
    shell: "python dl_docs.py"

# Deletes the intermediate directories that the download, parse and preprocess
# rules declare as outputs (data/docs, data/txts, data/preprocessed).
# The paths carry the data/ prefix to match those rules; without it the
# command targeted top-level directories that no rule in this workflow creates.
# The corpus outputs (data/corpus_*) are left in place.
rule clean:
    shell:
        "rm -rf data/docs data/txts data/preprocessed"