
Commit 868e2c4

VERSION 1.0
0 parents  commit 868e2c4

55 files changed: +3770 -0 lines

.aws/config

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+[default]
+region=us-east-1

.aws/credentials

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+[default]
+aws_access_key_id=PUT_HERE_YOUR_AWS_ACCESS_KEY_ID
+aws_secret_access_key=PUT_HERE_YOUR_AWS_SECRET_ACCESS_KEY
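These files hold the AWS profile that the scraper, Terraform, and the Airflow tasks read. A minimal sketch (not part of this commit) of checking that the default profile resolves once the placeholder keys are replaced, assuming the .aws/ files are copied or mounted to the standard ~/.aws/ location (or pointed at via AWS_SHARED_CREDENTIALS_FILE / AWS_CONFIG_FILE):

```python
# Sketch only: confirm boto3 picks up the "default" profile and region.
# Assumes the repo's .aws/ files end up in ~/.aws/ (or are referenced through
# AWS_SHARED_CREDENTIALS_FILE / AWS_CONFIG_FILE); the STS call is illustrative.
import boto3

session = boto3.Session(profile_name="default")
print(session.region_name)  # expected: us-east-1, from .aws/config

identity = session.client("sts").get_caller_identity()
print(identity["Account"], identity["Arn"])
```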

.gitignore

Lines changed: 149 additions & 0 deletions

@@ -0,0 +1,149 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# My
+skills_scraper.zip
+.aws/config
+.aws/credentials
+.idea/
+setup/
+temp/
+logs/
+plugins/
+skills_finder_env/
+skills_finder_web/data/
+skills_scraper/.serverless
+skills_scraper/node_modules
+skills_scraper/package-lock.json
+terraform/.terraform
+terraform/terraform.tfstate
+terraform/.terraform.lock.hcl
+terraform/terraform.tfstate.backup
+terraform/.terraform.tfstate.lock.info

.pre-commit-config.yaml

Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 21.12b0
+    hooks:
+      - id: black
+        args: [ --line-length=79, ./dags, ./skills_scraper, ./skills_finder_web]
+
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.0.1
+    hooks:
+      - id: check-json
+      - id: check-yaml
+      - id: check-added-large-files
+      - id: end-of-file-fixer
+      - id: requirements-txt-fixer
+      - id: trailing-whitespace
+      - id: debug-statements
+        language_version: python3
+
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v2.29.1
+    hooks:
+      - id: pyupgrade
+        args: [ --py36-plus]

README.md

Lines changed: 71 additions & 0 deletions

@@ -0,0 +1,71 @@
+# most-wanted-programming-skills-finder
+
+# Table of contents
+
+* [General info](#general-info)
+* [Technologies](#technologies)
+* [Setup](#setup)
+* [Application view](#application-view)
+
+## General info
+
+<details>
+<summary>Click here to see general information about the application!</summary>
+<br>
+With this app, you can see what programming skills are most in demand in the
+current job market.
+
+</details>
+
+## Technologies
+
+<details>
+<summary>Click here to see the technologies used!</summary>
+<ul>
+<li>Airflow 2.0</li>
+<li>AWS EMR 5.25</li>
+<li>AWS Lambda</li>
+<li>Boto3 1.20.0</li>
+<li>Docker 20.10.7</li>
+<li>Docker-compose 1.29.2</li>
+<li>Django 3.2.5</li>
+<li>Hadoop 2.8.5</li>
+<li>Pandas 1.3.4</li>
+<li>Python 3.8.5</li>
+<li>Postgres 12.5</li>
+<li>Scrapy 1.7.3</li>
+<li>Serverless 2.64.1</li>
+<li>Terraform 1.0.3</li>
+</ul>
+</details>
+
+## Setup
+
+---
+### Prepare skills scraper
+1. Write your AWS credentials into the __/skills_finder_web/.env__ and __/.aws/credentials__ files.
+2. Run the commands below (you must use a __Linux__ system to execute them!)
+```shell
+chmod +x create_aws_env.sh
+./create_aws_env.sh
+```
+---
+### Run ETL data pipeline
+3. After that, you can start the ETL data pipeline using Airflow (available at http://127.0.0.1:8080)
+```
+docker-compose -f ./docker-compose-airflow.yml up --build
+```
+---
+### Run web server
+4. Finally, when all DAGs have completed their runs, it is time to launch the web app. Just
+navigate to the /skills_finder_web directory and bring up another container.
+```
+docker-compose -f ./docker-compose-web.yml up --build
+```
+
+The application will be available at http://127.0.0.1:8000
+
+## Application view
+
+### ETL data pipeline
+![data_pipeline_top](https://user-images.githubusercontent.com/57534862/144881540-3e060653-b0d4-4176-bf1c-61633dd2838d.PNG)

create_aws_env.sh

Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC2164
+# shellcheck disable=SC2103
+
+cd skills_scraper/
+serverless plugin install --name serverless-python-requirements
+serverless package
+sleep 2
+cd ..
+cp -r skills_scraper/.serverless/skills_scraper.zip .
+
+cd terraform/
+terraform init
+sleep 2
+terraform apply
+sleep 2
+
+cd ..
+rm skills_scraper.zip
+cd skills_scraper
+rm package-lock.json
+rm -rf .serverless
+rm -rf node_modules
+cd ..
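The script packages the scraper with the Serverless Framework and hands the resulting skills_scraper.zip to Terraform for deployment. As a hedged post-deploy sanity check, assuming the Terraform configs create one or more Lambda functions from that archive (the "skills" name filter below is a guess, not something defined in this commit), one could list what actually got deployed:

```python
# Sketch only: list Lambda functions after `terraform apply` to confirm the
# scraper archive was deployed. The "skills" substring filter is an assumption;
# the real function names live in the terraform/ configuration.
import boto3

client = boto3.client("lambda", region_name="us-east-1")
for page in client.get_paginator("list_functions").paginate():
    for fn in page["Functions"]:
        if "skills" in fn["FunctionName"].lower():
            print(fn["FunctionName"], fn["Runtime"], fn["LastModified"])
```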

dags/config.py

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+"""
+All dags config is located here.
+"""
+
+# PSL
+from os import path
+from pathlib import Path
+
+BUCKET_NAME: str = "skills-finder-storage"
+
+TECH_SKILLS_FILE_S3_LOC: str = "data/tech_skills.txt"
+TECH_SKILLS_FILE_LOCAL_LOC: str = path.join(
+    Path(__file__).parent, "data_", "tech_skills.txt"
+)
+
+PYSPARK_JOBS_S3_LOC: str = "emr/_pyspark/jobs/skills_analyzer.py"
+PYSPARK_JOBS_LOCAL_LOC: str = path.join(
+    Path(__file__).parent, "emr", "_pyspark", "jobs", "skills_analyzer.py"
+)
+
+PYSPARK_JOBS_SUBMIT_CONFIG = [
+    {
+        "Name": "skills-analyzer-job",
+        "ActionOnFailure": "TERMINATE_CLUSTER",
+        "HadoopJarStep": {
+            "Properties": [],
+            "Jar": "command-runner.jar",
+            "Args": [
+                "spark-submit",
+                f"s3://{BUCKET_NAME}/{PYSPARK_JOBS_S3_LOC}",
+            ],
+        },
+    },
+]
+
+EMR_JOB_FLOW = {
+    "Name": "skills analyzer from airflow",
+    "ReleaseLabel": "emr-5.25.0",
+    "Applications": [{"Name": "Hadoop"}, {"Name": "Spark"}],
+    "Configurations": [
+        {
+            "Classification": "spark-env",
+            "Configurations": [
+                {
+                    "Classification": "export",
+                    "Properties": {"PYSPARK_PYTHON": "/usr/bin/python3"},
+                }
+            ],
+        }
+    ],
+    "Instances": {
+        "InstanceGroups": [
+            {
+                "Name": "Master node",
+                "Market": "SPOT",
+                "InstanceRole": "MASTER",
+                "InstanceType": "m4.xlarge",
+                "InstanceCount": 1,
+            },
+            {
+                "Name": "Core - 2",
+                "Market": "SPOT",
+                "InstanceRole": "CORE",
+                "InstanceType": "m4.xlarge",
+                "InstanceCount": 2,
+            },
+        ],
+        "TerminationProtected": False,
+        "Ec2KeyName": "skills-finder-key-pair",
+        "Ec2SubnetId": "subnet-02202364546d50ad9",
+        "EmrManagedMasterSecurityGroup": "sg-084e3a57ed95b63a7",
+        "EmrManagedSlaveSecurityGroup": "sg-0cb737d37d30c08b2",
+    },
+    "JobFlowRole": "EMR_EC2_DefaultRole",
+    "ServiceRole": "EMR_DefaultRole",
+}
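EMR_JOB_FLOW and PYSPARK_JOBS_SUBMIT_CONFIG follow the shapes that boto3's run_job_flow and add_job_flow_steps calls (and hence Airflow's Amazon provider operators) expect. A minimal sketch of how a DAG might consume them; the DAG id, schedule, and connection id are assumptions, the actual DAGs in this commit may be wired differently, and the operator import paths vary with the Amazon provider version:

```python
# Sketch only: one way this config could drive an EMR job from Airflow 2.x.
# Import paths below match older amazon provider releases; newer ones expose
# the same operators from airflow.providers.amazon.aws.operators.emr.
from datetime import datetime

from airflow import DAG
from airflow.providers.amazon.aws.operators.emr_create_job_flow import (
    EmrCreateJobFlowOperator,
)
from airflow.providers.amazon.aws.operators.emr_add_steps import EmrAddStepsOperator
from airflow.providers.amazon.aws.sensors.emr_step import EmrStepSensor

from config import EMR_JOB_FLOW, PYSPARK_JOBS_SUBMIT_CONFIG

with DAG(
    dag_id="skills_analyzer_example",  # hypothetical id, not from this commit
    start_date=datetime(2021, 12, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    # Spin up the transient EMR cluster described by EMR_JOB_FLOW.
    create_cluster = EmrCreateJobFlowOperator(
        task_id="create_emr_cluster",
        job_flow_overrides=EMR_JOB_FLOW,
        aws_conn_id="aws_default",
    )

    # Submit the spark-submit step that runs skills_analyzer.py from S3.
    submit_job = EmrAddStepsOperator(
        task_id="submit_pyspark_job",
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        steps=PYSPARK_JOBS_SUBMIT_CONFIG,
        aws_conn_id="aws_default",
    )

    # Block until the submitted step finishes (or fails).
    wait_for_job = EmrStepSensor(
        task_id="wait_for_job",
        job_flow_id="{{ task_instance.xcom_pull(task_ids='create_emr_cluster', key='return_value') }}",
        step_id="{{ task_instance.xcom_pull(task_ids='submit_pyspark_job', key='return_value')[0] }}",
        aws_conn_id="aws_default",
    )

    create_cluster >> submit_job >> wait_for_job
```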
