diff --git a/2_EDA/01_top_demanded_skills.sql b/2_EDA/01_top_demanded_skills.sql
new file mode 100644
index 0000000..8e16508
--- /dev/null
+++ b/2_EDA/01_top_demanded_skills.sql
@@ -0,0 +1,124 @@
+/*
+Data table prep work: preview each table and list the ID columns that the
+joins further down rely on.
+*/
+
+-- Preview the fact table and both skill tables.
+SELECT
+    *
+FROM job_postings_fact AS jpf
+LIMIT 10;
+
+SELECT
+    *
+FROM skills_dim AS sd
+LIMIT 10;
+
+SELECT
+    *
+FROM skills_job_dim AS sjd
+LIMIT 10;
+
+-- Column metadata for everything in the data_jobs catalog.
+SELECT
+    *
+FROM information_schema.columns
+WHERE table_catalog = 'data_jobs';
+
+-- Narrow the metadata to columns whose name contains "id" in the three tables joined below.
+SELECT
+    *
+FROM information_schema.columns
+WHERE table_catalog = 'data_jobs'
+    AND column_name LIKE '%id%'
+    AND table_name IN ('skills_dim', 'job_postings_fact', 'skills_job_dim');
+
+/*
+Question: What are the most in-demand skills for data engineers?
+- Join job postings to the skills tables through the skills_job_dim bridge table
+- Identify the top 10 in-demand skills for data engineers
+- Focus on remote job postings
+- Why? Retrieves the top 10 skills with the highest demand in the remote job market,
+providing insights into the most valuable skills for data engineers seeking remote work
+*/
+
+-- Same three previews as at the top of the file, this time without table aliases.
+SELECT
+    *
+FROM job_postings_fact
+LIMIT 10;
+
+SELECT
+    *
+FROM skills_job_dim
+LIMIT 10;
+
+SELECT
+    *
+FROM skills_dim
+LIMIT 10;
+
+-- Values taken by the remote-work flag on job titles containing "Data".
+SELECT DISTINCT
+    job_work_from_home
+FROM job_postings_fact
+WHERE job_title_short LIKE '%Data%'
+LIMIT 10;
+
+-- Top 10 skills by number of remote Data Engineer postings that list them.
+-- Every joined row pairs one posting with one skill, so COUNT(*) per skill is
+-- the demand figure. The title filter is an exact match (no wildcard needed).
+SELECT
+    sd.skills,
+    COUNT(*) AS demand_skills
+FROM job_postings_fact AS jpf
+INNER JOIN skills_job_dim AS sjd
+    ON jpf.job_id = sjd.job_id
+INNER JOIN skills_dim AS sd
+    ON sjd.skill_id = sd.skill_id
+WHERE jpf.job_title_short = 'Data Engineer'
+    AND jpf.job_work_from_home = TRUE
+GROUP BY sd.skills
+ORDER BY
+    demand_skills DESC,
+    sd.skills  -- tie-break so the top 10 is reproducible
+LIMIT 10;
+
+/*
+Data Engineering Skills — Market Summary
+Work-From-Home Demand Analysis
+
+Summary
+
+Analysis of the 143,293 skill mentions behind the top 10 skills in remote data engineering job postings shows a clear hierarchy: foundational languages (SQL, Python) dominate demand, followed by cloud platforms and big data tooling. 
Because the query only covers work-from-home postings, this ranking describes remote roles; cloud-native skills feature prominently, plausibly because they remove any dependency on physical infrastructure and enable fully remote workflows.
+Key Findings
+
+SQL (29,221) and Python (28,776) are the top two skills; together they account for about 40% of the 143,293 mentions in the top-10 table below — both are essential for any data engineering role.
+Cloud platforms (AWS, Azure, GCP) collectively account for ~27% of those top-10 mentions (38,412); all of their tooling is browser/API-accessible with no on-site infrastructure needed, which suits remote work.
+Big data and orchestration tools — Spark, Airflow, Snowflake, and Databricks — dominate the mid-tier, signalling that remote roles increasingly expect autonomous pipeline management.
+Java remains relevant at #9 (7,267 mentions), primarily for JVM-based systems like Kafka and legacy Spark environments.
+
+    ┌────────────┬───────────────┐
+    │   skills   │ demand_skills │
+    │  varchar   │     int64     │
+    ├────────────┼───────────────┤
+    │ sql        │         29221 │
+    │ python     │         28776 │
+    │ aws        │         17823 │
+    │ azure      │         14143 │
+    │ spark      │         12799 │
+    │ airflow    │          9996 │
+    │ snowflake  │          8639 │
+    │ databricks │          8183 │
+    │ java       │          7267 │
+    │ gcp        │          6446 │
+    ├────────────┴───────────────┤
+    │ 10 rows          2 columns │
+    └────────────────────────────┘
+
+*/
+
+
+
+
+
diff --git a/2_EDA/02_top_paying_jobs.sql b/2_EDA/02_top_paying_jobs.sql
new file mode 100644
index 0000000..d3becb5
--- /dev/null
+++ b/2_EDA/02_top_paying_jobs.sql
@@ -0,0 +1,29 @@
+-- Column metadata for everything in the data_jobs catalog.
+SELECT
+    *
+FROM information_schema.columns
+WHERE table_catalog = 'data_jobs';
+
+-- Top skills by median salary among remote Data Engineer postings.
+-- MEDIAN skips NULL salaries, so the sample-size threshold is applied to the
+-- postings that report a salary, not to every posting mentioning the skill.
+SELECT
+    sd.skills,
+    ROUND(MEDIAN(jpf.salary_year_avg)) AS median_salary,
+    COUNT(*) AS demand_skills
+FROM job_postings_fact AS jpf
+INNER JOIN skills_job_dim AS sjd
+    ON jpf.job_id = sjd.job_id
+INNER JOIN skills_dim AS sd
+    ON sjd.skill_id = sd.skill_id
+WHERE jpf.job_title_short = 'Data Engineer'
+    AND jpf.job_work_from_home = TRUE
+GROUP BY sd.skills
+HAVING COUNT(jpf.salary_year_avg) > 100
+ORDER BY
+    median_salary DESC,
+    sd.skills  -- tie-break so the result is reproducible
+LIMIT 
25
+;
+
+
diff --git a/2_EDA/03_optimal_skills.sql b/2_EDA/03_optimal_skills.sql
new file mode 100644
index 0000000..8d3256e
--- /dev/null
+++ b/2_EDA/03_optimal_skills.sql
@@ -0,0 +1,169 @@
+-- Median salary per skill for remote Data Engineer postings, with two counts:
+-- demand_skills counts every posting listing the skill, corrected_demand_count
+-- only those with a non-NULL salary_year_avg (the rows MEDIAN actually uses).
+-- The HAVING threshold is applied to demand_skills, so skills with very few
+-- salaried postings still appear; the second query below filters those out.
+SELECT
+    sd.skills,
+    ROUND(MEDIAN(jpf.salary_year_avg)) AS median_salary,
+    COUNT(*) AS demand_skills,
+    COUNT(jpf.salary_year_avg) AS corrected_demand_count
+FROM job_postings_fact AS jpf
+INNER JOIN skills_job_dim AS sjd
+    ON jpf.job_id = sjd.job_id
+INNER JOIN skills_dim AS sd
+    ON sjd.skill_id = sd.skill_id
+WHERE jpf.job_title_short = 'Data Engineer'
+    AND jpf.job_work_from_home = TRUE
+GROUP BY sd.skills
+HAVING COUNT(*) > 100
+ORDER BY
+    median_salary DESC
+LIMIT 25;
+
+/*
+Output of the query above:
+┌────────────┬───────────────┬───────────────┬────────────────────────┐
+│   skills   │ median_salary │ demand_skills │ corrected_demand_count │
+│  varchar   │    double     │     int64     │         int64          │
+├────────────┼───────────────┼───────────────┼────────────────────────┤
+│ rust       │      210000.0 │           232 │                     23 │
+│ golang     │      184000.0 │           912 │                     39 │
+│ terraform  │      184000.0 │          3248 │                    193 │
+│ spring     │      175500.0 │           364 │                     33 │
+│ neo4j      │      170000.0 │           277 │                     11 │
+│ gdpr       │      169616.0 │           582 │                     22 │
+│ zoom       │      168438.0 │           127 │                     12 │
+│ graphql    │      167500.0 │           445 │                     28 │
+│ mongo      │      162250.0 │           265 │                     14 │
+│ fastapi    │      157500.0 │           204 │                      3 │
+│ bitbucket  │      155000.0 │           478 │                      9 │
+│ django     │      155000.0 │           265 │                      5 │
+│ crystal    │      154224.0 │           129 │                      3 │
+│ c          │      151500.0 │           444 │                     23 │
+│ atlassian  │      151500.0 │           249 │                      9 │
+│ typescript │      151000.0 │           388 │                     39 │
+│ kubernetes │      150500.0 │          4202 │                    147 │
+│ node       │      150000.0 │           179 │                     22 │
+│ ruby       │      150000.0 │           736 │                     48 │
+│ css        │      150000.0 │           262 │                     13 │
+│ airflow    │      150000.0 │          9996 │                    386 │
+│ redis      │      149000.0 │           605 │                     17 │
+│ vmware     │      148798.0 │           136 │                      2 │
+│ ansible    │      148798.0 │           475 │                     14 │
+│ jupyter    │      147500.0 │           400 │                     15 │
+├────────────┴───────────────┴───────────────┴────────────────────────┤
+│ 25 rows                                                   4 columns │
+└─────────────────────────────────────────────────────────────────────┘
+*/
+
+-- Rank skills by a composite "optimal" score: LN(posting count) multiplied by
+-- median salary, divided by 1,000,000. Only postings with a reported salary are
+-- kept, so demand_count is the number of rows behind each median.
+-- NOTE: ORDER BY uses the rounded score, so tied scores come back in arbitrary order.
+SELECT
+    sd.skills,
+    ROUND(MEDIAN(jpf.salary_year_avg)) AS median_salary,
+    COUNT(*) AS demand_count,
+    ROUND(LN(COUNT(*)), 2) AS ln_demand_count,
+    ROUND((LN(COUNT(*)) * MEDIAN(jpf.salary_year_avg)) / 1_000_000, 2) AS optimal_score
+FROM job_postings_fact AS jpf
+INNER JOIN skills_job_dim AS sjd
+    ON jpf.job_id = sjd.job_id
+INNER JOIN skills_dim AS sd
+    ON sjd.skill_id = sd.skill_id
+WHERE jpf.job_title_short = 'Data Engineer'
+    AND jpf.job_work_from_home = TRUE
+    AND jpf.salary_year_avg IS NOT NULL
+GROUP BY sd.skills
+HAVING COUNT(*) > 100
+ORDER BY
+    optimal_score DESC
+LIMIT 25;
+
+/*
+Output of the query above:
+┌────────────┬───────────────┬──────────────┬─────────────────┬───────────────┐
+│   skills   │ median_salary │ demand_count │ ln_demand_count │ optimal_score │
+│  varchar   │    double     │    int64     │     double      │    double     │
+├────────────┼───────────────┼──────────────┼─────────────────┼───────────────┤
+│ terraform  │      184000.0 │          193 │            5.26 │          0.97 │
+│ python     │      135000.0 │         1133 │            7.03 │          0.95 │
+│ aws        │      137320.0 │          783 │            6.66 │          0.91 │
+│ sql        │      130000.0 │         1128 │            7.03 │          0.91 │
+│ airflow    │      150000.0 │          386 │            5.96 │          0.89 │
+│ spark      │      140000.0 │          503 │            6.22 │          0.87 │
+│ kafka      │      145000.0 │          292 │            5.68 │          0.82 │
+│ snowflake  │      135500.0 │          438 │            6.08 │          0.82 │
+│ azure      │      128000.0 │          475 │            6.16 │          0.79 │
+│ java       │      135000.0 │          303 │            5.71 │          0.77 │
+│ scala      │      137290.0 │          247 │            5.51 │          0.76 │
+│ git        │      140000.0 │          208 │            5.34 │          0.75 │
+│ kubernetes │      150500.0 │          147 │            4.99 │          0.75 │
+│ databricks │      132750.0 │          266 │            5.58 │          0.74 │
+│ redshift   │      130000.0 │          274 │            5.61 │          0.73 │
+│ gcp        │      136000.0 │          196 │            5.28 │          0.72 │
+│ nosql      │      134415.0 │          193 │            5.26 │          0.71 │
+│ hadoop     │      135000.0 │          198 │            5.29 │          0.71 │
+│ pyspark    │      140000.0 │          152 │            5.02 │           0.7 │
+│ mongodb    │      135750.0 │          136 │            4.91 │          0.67 │
+│ docker     │      135000.0 │          144 │            4.97 │          0.67 │
+│ r          │      134775.0 │          133 │            4.89 │          0.66 │
+│ go         │      140000.0 │          113 │            4.73 │          0.66 │
+│ github     │      135000.0 │          127 │            4.84 │          0.65 │
+│ bigquery   │      135000.0 │          123 │            4.81 │          0.65 │
+├────────────┴───────────────┴──────────────┴─────────────────┴───────────────┤
+│ 25 rows                                                           5 columns │
+└─────────────────────────────────────────────────────────────────────────────┘
+
+Summary
+This analysis examines the optimal skills for remote Data Engineer roles by combining salary
+and demand into a single composite score — the natural log of the posting count multiplied by
+median salary. This approach rewards skills that are both well-compensated and widely requested,
+avoiding the trap of chasing either high pay in niche roles or high volume in lower-paying ones.
+The table covers 25 skills, each appearing in more than 100 remote job postings with a reported
+salary, so every median rests on a reasonable sample.
+The results reveal a clear tiering: Terraform, Python, AWS, and SQL occupy the top cluster with
+optimal scores between 0.91–0.97, driven by strong medians ($130K–$184K) and, Terraform aside, the highest demand.
+Below them sits a rich mid-tier — Airflow, Spark, Kafka, Snowflake — where slightly lower demand
+is offset by above-average salaries, particularly in streaming and orchestration. The bottom cluster
+(Docker, Go, GitHub, BigQuery) still commands solid $135K+ medians but trails on demand volume,
+making them valuable secondary skills rather than primary targets for career positioning.
+
+Key Findings
+
+Terraform is the highest-value single skill with the top optimal score (0.97)
+and the highest median salary of any skill in the dataset at $184,000 — nearly
+$50K above the group average. Despite relatively modest demand (193 postings),
+its salary premium is so pronounced that it outscores even Python and SQL.
+Infrastructure-as-code expertise is rare, commands a significant wage premium,
+and is directly suited to remote work since all provisioning is done via CLI and
+APIs.
+Python and SQL are the volume anchors of the market, each appearing in over 1,100
+remote postings — well ahead of third-placed AWS (783) — and both scoring 0.91–0.95.
+Their median salaries ($135K and $130K respectively) are solid but not exceptional;
+their dominance comes from ubiquity. For anyone entering the field, these two skills
+represent the lowest-risk, highest-return investment — nearly every role expects them.
+
+AWS leads the cloud platforms, outscoring Azure and GCP by a notable margin (0.91 vs.
+0.79 and 0.72). AWS and Azure sit in the top ten while GCP ranks 16th of 25, and AWS
+uniquely combines strong demand (783 postings) with the highest cloud median salary
+($137,320). This reflects AWS's continued dominance in enterprise data infrastructure
+and its tight integration with modern data stacks. Azure and GCP remain important but
+are stronger as complementary skills.
+
+Streaming and orchestration tools (Kafka, Airflow, Spark) offer a high salary-to-demand
+ratio, clustering between $140K–$150K median salaries with moderate but healthy demand
+(292–503 postings each). These are the skills most likely to differentiate a mid-career
+engineer, signalling the ability to manage real-time pipelines and complex DAG-based
+workflows autonomously — exactly the profile remote-first teams are hiring for.
+
+Infrastructure and containerisation skills (Kubernetes, Git, Docker) punch above
+their demand weight on salary — Kubernetes in particular has the second-highest median
+salary in the dataset at $150,500, despite appearing in only 147 postings. This
+niche-but-lucrative pattern suggests that DevOps-adjacent data engineers who can
+manage containerised workloads command a meaningful premium, even if the absolute
+number of such roles is smaller. These are strong specialisation targets for engineers
+already solid in Python/SQL/cloud.
+
+*/
diff --git a/2_EDA/1_1_Project1_EDA_image.png b/2_EDA/1_1_Project1_EDA_image.png
new file mode 100644
index 0000000..0b70c9b
Binary files /dev/null and b/2_EDA/1_1_Project1_EDA_image.png differ