# Use bash instead of make's default /bin/sh (the `collect` recipe relies on bash's `[[ ]]` test)
SHELL := /bin/bash

.PHONY: all-dep
# Install dependencies for development, processing and the dashboard
all-dep:
	poetry install --with dev,pipeline,webcrawl
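# (the dev, pipeline and webcrawl dependency groups are defined in pyproject.toml)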

.PHONY: min-dep
# Install dependencies for the dashboard only
min-dep:
	poetry install

.PHONY: lint
# Verify proper formatting for Python files
lint:
	ruff check .
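# (`lint` only reports issues; run `make format` to fix them automatically)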

.PHONY: format
# Automatically fix linting errors in all Python files
format:
	ruff check --fix .

.PHONY: test
# Run all project test suites
test:
	pytest test/

.PHONY: ci
# Run all continuous integration checks
ci: lint test

.PHONY: serve
# Launch a Streamlit dashboard server
serve:
	streamlit run Introduction.py
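# (Streamlit accepts a custom port via --server.port, e.g.: streamlit run Introduction.py --server.port 8080)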

.PHONY: clean
# Remove all processing artifacts, build files and cache files
clean: clean-data
	rm -f poetry.lock
	rm -rf .ruff_cache/ .pytest_cache/
	find . -type d -name '__pycache__' -exec rm -rf {} +
	find . -type d -name '.data' -exec rm -rf {} +

.PHONY: clean-data
# Remove the previously collected dataframe
clean-data:
	rm -f data/dataframe.csv

.PHONY: pipeline
# Run the data processing pipeline on the webcrawler output
pipeline:
	python pipeline/run.py
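# (the expected input is the crawler output gathered by `make collect`)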

.PHONY: collect
# Collect data from both web and standalone crawlers
# Before running, set the `CHROME_DRIVER` environment variable for the standalone crawlers
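# Example invocation (the driver path below is illustrative):
#   CHROME_DRIVER=/usr/local/bin/chromedriver make collect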
collect:
# Run the Scrapy spiders; each writes its results to a JSON file under .data/
	cd crawlers/scrapy; for platform in futurelearn skillshare udemy; do \
		scrapy crawl "$$platform" -o ".data/$$platform.json"; \
	done
# Create the shared output directory if it does not exist yet
	if [[ ! -d "data/.data" ]]; then \
		mkdir -p "data/.data"; \
	fi
	mv crawlers/scrapy/.data/* data/.data/
	rm -rf crawlers/scrapy/.data
# Run the standalone crawlers, which write their output to crawlers/standalone/.data/
	for platform in alison coursera edx pluralsight skillshare; do \
		python -m "crawlers.standalone.$$platform"; \
	done
	mv crawlers/standalone/.data/* data/.data/
	rm -rf crawlers/standalone/.data