-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_demographics.py
165 lines (130 loc) · 4.28 KB
/
generate_demographics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# Databricks notebook source
from databricks.connect import DatabricksSession
from pyspark.sql import Window
from pyspark.sql import functions as F

# Spark session via Databricks Connect (reuses an existing session if present)
spark = DatabricksSession.builder.getOrCreate()
# COMMAND ----------
# Root Unity Catalog volume holding the raw population projection csv folders
# and the parquet outputs written by this notebook
path = "/Volumes/su_data/nhp/population-projections"
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # Convert source demographics files to parquet
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC ## Demographics
# COMMAND ----------
def create_demographic_parquet(
    old_projection_name, new_projection_name, start_year=2018, end_year=2043
):
    """Convert one SNPP population projection csv pair (males/females) to parquet.

    Reads ``2018 SNPP Population males.csv`` / ``... females.csv`` from
    ``{path}/{old_projection_name}``, unpivots the wide per-year columns into
    long ``(year, value)`` rows, and writes the result (partitioned by
    ``area_code``) to
    ``{path}/demographic_data/projection={new_projection_name}/sex={1,2}``.

    :param old_projection_name: source folder name under ``path``
    :param new_projection_name: value used for the ``projection=`` partition
    :param start_year: first projection year column to unpivot (inclusive)
    :param end_year: last projection year column to unpivot (inclusive)
    """
    years = [str(y) for y in range(start_year, end_year + 1)]
    # stack() takes (label, column) pairs: turns wide year columns into
    # long (year, value) rows
    stack_str = ", ".join(f"'{y}', `{y}`" for y in years)
    for sex_int, sex_string in [(1, "males"), (2, "females")]:
        (
            spark.read.csv(
                f"{path}/{old_projection_name}/2018 SNPP Population {sex_string}.csv",
                header=True,
            )
            # drop the pre-aggregated "All ages" total rows
            .filter(F.col("AGE_GROUP") != "All ages")
            # keep only area codes matching E06-E09 prefixes
            # (presumably English LA districts/unitary authorities -- confirm)
            .filter(F.col("AREA_CODE").rlike("^E0[6-9]"))
            .withColumn(
                "age",
                # "90 and over" is the open-ended top band; code it as 90
                F.when(F.col("AGE_GROUP") == "90 and over", 90).otherwise(
                    F.col("AGE_GROUP").astype("int")
                ),
            )
            .selectExpr(
                "AREA_CODE as area_code",
                "age",
                f"stack({len(years)}, {stack_str}) as (year, value)",
            )
            .orderBy("age")
            # single output file per partition keeps downstream reads simple
            .repartition(1)
            .write.mode("overwrite")
            .partitionBy("area_code")
            .parquet(
                f"{path}/demographic_data/projection={new_projection_name}/sex={sex_int}"
            )
        )
# COMMAND ----------
# Convert the principal projection variant; other variants would be converted
# with further calls to the same function
create_demographic_parquet("snpp_2018b_principal_proj", "principal_proj")
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC ## Births
# COMMAND ----------
def create_birth_parquet(start_year=2019, end_year=2043):
    """Convert the SNPP births projection csv to parquet.

    Reads ``2018 SNPP Births persons.csv`` from
    ``{path}/snpp_2018b_principal_proj_births``, unpivots the wide per-year
    columns into long ``(year, value)`` rows, and writes the result
    (partitioned by ``area_code``) to
    ``{path}/birth_data/projection=principal_proj``.

    :param start_year: first projection year column to unpivot (inclusive).
        Note births start one year later (2019) than the population data.
    :param end_year: last projection year column to unpivot (inclusive)
    """
    years = [str(y) for y in range(start_year, end_year + 1)]
    # stack() takes (label, column) pairs: turns wide year columns into
    # long (year, value) rows
    stack_str = ", ".join(f"'{y}', `{y}`" for y in years)
    (
        spark.read.csv(
            f"{path}/snpp_2018b_principal_proj_births/2018 SNPP Births persons.csv",
            header=True,
        )
        # drop the pre-aggregated "All ages" total rows
        .filter(F.col("AGE_GROUP") != "All ages")
        # keep only area codes matching E06-E09 prefixes
        # (presumably English LA districts/unitary authorities -- confirm)
        .filter(F.col("AREA_CODE").rlike("^E0[6-9]"))
        .withColumn(
            "age",
            # "90 and over" is the open-ended top band; code it as 90
            F.when(F.col("AGE_GROUP") == "90 and over", 90).otherwise(
                F.col("AGE_GROUP").astype("int")
            ),
        )
        .selectExpr(
            "AREA_CODE as area_code",
            "age",
            f"stack({len(years)}, {stack_str}) as (year, value)",
        )
        .orderBy("age")
        # single output file per partition keeps downstream reads simple
        .repartition(1)
        .write.mode("overwrite")
        .partitionBy("area_code")
        .parquet(f"{path}/birth_data/projection=principal_proj")
    )
# COMMAND ----------
# Convert the births projection csv to parquet
create_birth_parquet()
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # Create demographics for model
# MAGIC
# MAGIC These steps should not exist in this file in the long term, they should go into the actual
# MAGIC data extraction scripts. For now however, adding in some examples of how to use the data.
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC ## Create national demographics file
# National view: sum demographic values over every area code in scope
# (E06-E09 prefixes), pivoted so each projection year becomes a column.
national_source = spark.read.parquet(f"{path}/demographic_data")
national_filtered = national_source.filter(F.col("area_code").rlike("^E0[6-9]"))
national_wide = (
    national_filtered.groupBy("projection", "sex", "age")
    .pivot("year")
    .agg(F.sum("value").alias("value"))
)
national_wide.orderBy("projection", "sex", "age")
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # Create provider demographics file
# COMMAND ----------
demographics = spark.read.parquet(f"{path}/demographic_data")
# window over all rows for a provider, used to turn raw counts into shares
total_window = Window.partitionBy("provider")
# financial year 2019/20 encoded as an integer
BASE_YEAR = 201920
# catchment areas contributing at most this share of a provider's activity
# are dropped
MIN_PCNT = 0.05
df = (
    spark.read.table("su_data.nhp.apc")
    .filter(F.col("fyear") == BASE_YEAR)
    # residence area codes with E06-E09 prefixes only
    .filter(F.col("resladst_ons").rlike("^E0[6-9]"))
    .groupBy("provider", "resladst_ons")
    .count()
    # first pass: each area's share of the provider's total activity
    .withColumn("pcnt", F.col("count") / F.sum("count").over(total_window))
    .filter(F.col("pcnt") > MIN_PCNT)
    # second pass (deliberate repeat): the window now sums only the rows that
    # survived the filter, so shares are renormalised to sum to 1 per provider
    .withColumn("pcnt", F.col("count") / F.sum("count").over(total_window))
    .withColumnRenamed("resladst_ons", "area_code")
    # attach the area-level projections, weight them by the provider's
    # catchment share, then aggregate to provider level
    .join(demographics, "area_code")
    .withColumn("value", F.col("value") * F.col("pcnt"))
    .groupBy("provider", "age", "sex", "projection")
    .pivot("year")
    .agg(F.sum("value"))
    .orderBy("provider", "age", "sex", "projection")
)
# COMMAND ----------
# bare expression: displays the provider demographics result in the notebook
df
# COMMAND ----------