# dvc.yaml
# DVC pipeline definition. Every entry under `stages` declares the command to
# run (`cmd`), the files/directories it depends on (`deps`) and the
# files/directories it produces (`outs`). Stages are linked implicitly: an
# `outs` path of one stage that appears in the `deps` of another makes the
# latter run after the former.
#
# NOTE(review): stage names are kept exactly as found (including the
# snake_case `merge_metadata` and the spelling `embedd-legal-docs`), because
# they are the targets used with `dvc repro <stage>` and the keys in dvc.lock.
stages:
  # ---- Policy documents: OECD source --------------------------------------
  # Produces data/oecd_docs/raw/ and data/oecd_docs/meta.csv.
  scrap-oecd:
    cmd: python scripts/scrap_oecd_ai_database.py
    deps:
      - scripts/scrap_oecd_ai_database.py
    outs:
      - data/oecd_docs/raw/
      - data/oecd_docs/meta.csv

  # Consumes data/oecd_docs/raw/ and writes data/oecd_docs/texts/.
  # Shares scripts/parse_docs_to_text.py with extract-text-nesta; the
  # positional argument (`oecd`) selects the source.
  extract-text-oecd:
    cmd: python scripts/parse_docs_to_text.py oecd
    deps:
      - scripts/parse_docs_to_text.py
      - data/oecd_docs/raw/
    outs:
      - data/oecd_docs/texts/

  # ---- Policy documents: Nesta source -------------------------------------
  # Produces data/nesta_ai_governance_docs/raw/ and the matching meta.csv.
  scrap-nesta:
    cmd: python scripts/scrap_nesta_ai_database.py
    deps:
      - scripts/scrap_nesta_ai_database.py
    outs:
      - data/nesta_ai_governance_docs/raw/
      - data/nesta_ai_governance_docs/meta.csv

  # Consumes data/nesta_ai_governance_docs/raw/ and writes .../texts/.
  extract-text-nesta:
    cmd: python scripts/parse_docs_to_text.py nesta
    deps:
      - scripts/parse_docs_to_text.py
      - data/nesta_ai_governance_docs/raw/
    outs:
      - data/nesta_ai_governance_docs/texts/

  # ---- Policy documents: processing of both sources -----------------------
  # Reads the extracted texts of both sources and writes a single joblib
  # file of embeddings.
  embedd-legal-docs:
    cmd: python scripts/calculate_docs_embeddings.py
    deps:
      - scripts/calculate_docs_embeddings.py
      - data/nesta_ai_governance_docs/texts/
      - data/oecd_docs/texts/
    outs:
      - data/processed/docs-bert-embeddings.joblib

  # Reads the extracted texts of both sources and writes the intermediate
  # parsed_legal_texts.joblib consumed by analyse-deontic-sentences.
  parse-legal-docs:
    cmd: python scripts/process_legal_docs_spacy.py
    deps:
      - scripts/process_legal_docs_spacy.py
      - data/nesta_ai_governance_docs/texts/
      - data/oecd_docs/texts/
    outs:
      - data/processed/intermediate/parsed_legal_texts.joblib

  # Combines the two scraped meta.csv files with
  # data/golden_standard/policy_docs_categories.csv, which is not an output
  # of any stage visible in this file.
  merge_metadata:
    cmd: python scripts/merge_metadata.py
    deps:
      - scripts/merge_metadata.py
      - data/oecd_docs/meta.csv
      - data/nesta_ai_governance_docs/meta.csv
      - data/golden_standard/policy_docs_categories.csv
    outs:
      - data/policy_docs_all/metadata.csv

  # Depends on the outputs of parse-legal-docs and merge_metadata.
  analyse-deontic-sentences:
    cmd: python scripts/analyse_deontics_sentences.py
    deps:
      - scripts/analyse_deontics_sentences.py
      - data/processed/intermediate/parsed_legal_texts.joblib
      - data/policy_docs_all/metadata.csv
    outs:
      - data/processed/deontics.csv

  # ---- S2ORC --------------------------------------------------------------
  # Also depends on mair/s2orc_links.py, so a change to that module
  # invalidates this stage.
  download-s2orc-meta:
    cmd: python scripts/download_s2orc_meta.py
    deps:
      - scripts/download_s2orc_meta.py
      - mair/s2orc_links.py
    outs:
      - data/s2orc/metadata/comp_sci/

  # Consumes data/s2orc/metadata/comp_sci/ and writes ai_papers_meta.csv.
  filter-ai-s2orc:
    cmd: python scripts/filter_ai_s2orc.py
    deps:
      - scripts/filter_ai_s2orc.py
      - data/s2orc/metadata/comp_sci/
    outs:
      - data/s2orc/ai_papers_meta.csv

  # Consumes ai_papers_meta.csv and writes data/s2orc/pdf_parses/ai/.
  download-full-ai-texts-s2orc:
    cmd: python scripts/download_full_text_s2orc.py
    deps:
      - scripts/download_full_text_s2orc.py
      - data/s2orc/ai_papers_meta.csv
    outs:
      - data/s2orc/pdf_parses/ai/

  # ---- arXiv dump ---------------------------------------------------------
  # `frozen: true` tells DVC not to re-execute this stage during `dvc repro`
  # (until it is unfrozen), even when its dependencies change.
  download-arxiv-dump:
    cmd: python scripts/download_arxiv_dump.py --download-sources --download-pdfs
    deps:
      - data/arxiv_dump/arxiv_categories.txt
      - mair/arxiv_dump/keywords.py
      - scripts/download_arxiv_dump.py
    outs:
      - data/arxiv_dump/papers
      - data/arxiv_dump/search_results.json
      - data/arxiv_dump/sources
    frozen: true

  # The command invokes the shell script directly (no interpreter prefix).
  unpack-sources-arxiv:
    cmd: scripts/unpack_sources.sh
    deps:
      - scripts/unpack_sources.sh
      - data/arxiv_dump/sources
    outs:
      - data/arxiv_dump/unpacked_sources

  # Consumes data/arxiv_dump/papers and writes raw_extracted_texts.
  extract-texts-arxiv:
    cmd: python scripts/parse_arxiv_pdfs.py
    deps:
      - scripts/parse_arxiv_pdfs.py
      - data/arxiv_dump/papers
    outs:
      - data/arxiv_dump/raw_extracted_texts

  # Consumes unpacked_sources and writes two JSON files.
  build-citation-graph:
    cmd: python scripts/build_citations_graph.py
    deps:
      - scripts/build_citations_graph.py
      - data/arxiv_dump/unpacked_sources
    outs:
      - data/arxiv_dump/citations_graph.json
      - data/arxiv_dump/semantic_scholar.json

  # Consumes raw_extracted_texts and writes cleaned_texts.
  clean-arxiv-texts:
    cmd: python scripts/clean_raw_papers_text.py
    deps:
      - scripts/clean_raw_papers_text.py
      - data/arxiv_dump/raw_extracted_texts
    outs:
      - data/arxiv_dump/cleaned_texts

  # Consumes cleaned_texts and writes parsed_arxiv_papers.joblib.
  parse-arxiv-papers:
    cmd: python scripts/process_arxiv_spacy.py
    deps:
      - scripts/process_arxiv_spacy.py
      - data/arxiv_dump/cleaned_texts
    outs:
      - data/processed/intermediate/parsed_arxiv_papers.joblib

  # Also depends on mair/affiliations_extraction.py in addition to the
  # driver script and the unpacked sources.
  extract-affiliations:
    cmd: python scripts/extract_affiliations.py
    deps:
      - data/arxiv_dump/unpacked_sources
      - mair/affiliations_extraction.py
      - scripts/extract_affiliations.py
    outs:
      - data/arxiv_dump/affiliations.json