Source data: California 2020 Census - California_CA.csv file from https://www.kaggle.com/zusmani/us-census-2020/version/3?select=California_CA.csv
File size: 721 MB
669171 x 401 columns
%pip install -U seaborn
%pip install geopandas
%pip install folium
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import databricks.koalas as ks
from pyspark.sql.functions import regexp_replace, regexp_extract, col
df_pd = pd.read_csv("/dbfs/FileStore/tables/tables/skum_test/ca_2020_census/California_CA.csv", header='infer')
df_pd.head()
FILEID | STUSAB | SUMLEV | GEOVAR | GEOCOMP | CHARITER | CIFSN_x | LOGRECNO | GEOID | GEOCODE | REGION | DIVISION | STATE | STATENS | COUNTY | COUNTYCC | COUNTYNS | COUSUB | COUSUBCC | COUSUBNS | SUBMCD | SUBMCDCC | SUBMCDNS | ESTATE | ESTATECC | ESTATENS | CONCIT | CONCITCC | CONCITNS | PLACE | PLACECC | PLACENS | TRACT | BLKGRP | BLOCK | AIANHH | AIHHTLI | AIANHHFP | AIANHHCC | AIANHHNS | ... | P0040048 | P0040049 | P0040050 | P0040051 | P0040052 | P0040053 | P0040054 | P0040055 | P0040056 | P0040057 | P0040058 | P0040059 | P0040060 | P0040061 | P0040062 | P0040063 | P0040064 | P0040065 | P0040066 | P0040067 | P0040068 | P0040069 | P0040070 | P0040071 | P0040072 | P0040073 | H0010001 | H0010002 | H0010003 | CIFSN_y.1 | P0050001 | P0050002 | P0050003 | P0050004 | P0050005 | P0050006 | P0050007 | P0050008 | P0050009 | P0050010 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | PLST | CA | 50 | 0 | 0 | 0 | 0 | 2 | 0500000US06001 | 6001 | 4 | 9 | 6 | 1779778 | 1.0 | H1 | 1675839.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 39 | 472 | 179 | 22 | 92 | 41 | 43 | 2 | 59 | 5 | 0 | 8 | 14 | 3 | 0 | 1 | 3 | 63 | 28 | 21 | 0 | 7 | 7 | 0 | 5 | 5 | 621958 | 591636 | 30322 | 3 | 53833 | 10130 | 3406 | 397 | 6218 | 109 | 43703 | 17463 | 421 | 25819 |
1 | PLST | CA | 50 | 0 | 0 | 0 | 0 | 3 | 0500000US06003 | 6003 | 4 | 9 | 6 | 1779778 | 3.0 | H1 | 1675840.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1540 | 530 | 1010 | 3 | 52 | 1 | 1 | 0 | 0 | 0 | 51 | 0 | 0 | 51 |
2 | PLST | CA | 50 | 0 | 0 | 0 | 0 | 4 | 0500000US06005 | 6005 | 4 | 9 | 6 | 1779778 | 5.0 | H1 | 1675841.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 2 | 6 | 0 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 3 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 18805 | 15678 | 3127 | 3 | 4311 | 4098 | 4045 | 2 | 51 | 0 | 213 | 0 | 0 | 213 |
3 | PLST | CA | 50 | 0 | 0 | 0 | 0 | 5 | 0500000US06007 | 6007 | 4 | 9 | 6 | 1779778 | 7.0 | H1 | 1675842.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3 | 19 | 5 | 1 | 5 | 1 | 0 | 0 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 90133 | 83268 | 6865 | 3 | 4941 | 1449 | 496 | 31 | 902 | 20 | 3492 | 2234 | 0 | 1258 |
4 | PLST | CA | 50 | 0 | 0 | 0 | 0 | 6 | 0500000US06009 | 6009 | 4 | 9 | 6 | 1779778 | 9.0 | H1 | 1675885.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 10 | 2 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 27422 | 18758 | 8664 | 3 | 461 | 311 | 170 | 40 | 101 | 0 | 150 | 0 | 0 | 150 |
5 rows × 401 columns
df_sp=spark.read.format("csv").option("header","true").load("/FileStore/tables/tables/skum_test/ca_2020_census/California_CA.csv")
#df_sp.head()
#df_sp.dtypes
census_df_sp = spark.read.option("inferSchema", True).option("header", True).csv("/FileStore/tables/tables/skum_test/ca_2020_census/California_CA.csv")
#display(census_df_sp)
#display(census_df.summary())
census_df_sp.count()
len(census_df_sp.columns)
import databricks.koalas as ks
census_ks = census_df_sp.to_koalas()
census_ks.head()
FILEID | STUSAB | SUMLEV | GEOVAR | GEOCOMP | CHARITER | CIFSN_x6 | LOGRECNO | GEOID | GEOCODE | REGION | DIVISION | STATE | STATENS | COUNTY | COUNTYCC | COUNTYNS | COUSUB | COUSUBCC | COUSUBNS | SUBMCD | SUBMCDCC | SUBMCDNS | ESTATE | ESTATECC | ESTATENS | CONCIT | CONCITCC | CONCITNS | PLACE | PLACECC | PLACENS | TRACT | BLKGRP | BLOCK | AIANHH | AIHHTLI | AIANHHFP | AIANHHCC | AIANHHNS | AITS | AITSFP | AITSCC | AITSNS | TTRACT | TBLKGRP | ANRC | ANRCCC | ANRCNS | CBSA | MEMI | CSA | METDIV | NECTA | NMEMI | CNECTA | NECTADIV | CBSAPCI | NECTAPCI | UA | UATYPE | UR | CD116 | CD118 | CD119 | CD120 | CD121 | SLDU18 | SLDU22 | SLDU24 | SLDU26 | SLDU28 | SLDL18 | SLDL22 | SLDL24 | SLDL26 | SLDL28 | VTD | VTDI | ZCTA | SDELM | SDSEC | SDUNI | PUMA | AREALAND | AREAWATR | BASENAME | NAME | FUNCSTAT | GCUNI | POP100 | HU100 | INTPTLAT | INTPTLON | LSADC | PARTFLAG | UGA | CIFSN_y97 | P0010001 | P0010002 | P0010003 | P0010004 | P0010005 | P0010006 | P0010007 | P0010008 | P0010009 | P0010010 | P0010011 | P0010012 | P0010013 | P0010014 | P0010015 | P0010016 | P0010017 | P0010018 | P0010019 | P0010020 | P0010021 | P0010022 | P0010023 | P0010024 | P0010025 | P0010026 | P0010027 | P0010028 | P0010029 | P0010030 | P0010031 | P0010032 | P0010033 | P0010034 | P0010035 | P0010036 | P0010037 | P0010038 | P0010039 | P0010040 | P0010041 | P0010042 | P0010043 | P0010044 | P0010045 | P0010046 | P0010047 | P0010048 | P0010049 | P0010050 | P0010051 | P0010052 | P0010053 | P0010054 | P0010055 | P0010056 | P0010057 | P0010058 | P0010059 | P0010060 | P0010061 | P0010062 | P0010063 | P0010064 | P0010065 | P0010066 | P0010067 | P0010068 | P0010069 | P0010070 | P0010071 | P0020001 | P0020002 | P0020003 | P0020004 | P0020005 | P0020006 | P0020007 | P0020008 | P0020009 | P0020010 | P0020011 | P0020012 | P0020013 | P0020014 | P0020015 | P0020016 | P0020017 | P0020018 | P0020019 | P0020020 | P0020021 | P0020022 | P0020023 | P0020024 | P0020025 | P0020026 | P0020027 | P0020028 | P0020029 | P0020030 | P0020031 | P0020032 | P0020033 | P0020034 | P0020035 | P0020036 | P0020037 | P0020038 | P0020039 | P0020040 | P0020041 | P0020042 | P0020043 | P0020044 | P0020045 | P0020046 | P0020047 | P0020048 | P0020049 | P0020050 | P0020051 | P0020052 | P0020053 | P0020054 | P0020055 | P0020056 | P0020057 | P0020058 | P0020059 | P0020060 | P0020061 | P0020062 | P0020063 | P0020064 | P0020065 | P0020066 | P0020067 | P0020068 | P0020069 | P0020070 | P0020071 | P0020072 | P0020073 | CIFSN_x242 | P0030001 | P0030002 | P0030003 | P0030004 | P0030005 | P0030006 | P0030007 | P0030008 | P0030009 | P0030010 | P0030011 | P0030012 | P0030013 | P0030014 | P0030015 | P0030016 | P0030017 | P0030018 | P0030019 | P0030020 | P0030021 | P0030022 | P0030023 | P0030024 | P0030025 | P0030026 | P0030027 | P0030028 | P0030029 | P0030030 | P0030031 | P0030032 | P0030033 | P0030034 | P0030035 | P0030036 | P0030037 | P0030038 | P0030039 | P0030040 | P0030041 | P0030042 | P0030043 | P0030044 | P0030045 | P0030046 | P0030047 | P0030048 | P0030049 | P0030050 | P0030051 | P0030052 | P0030053 | P0030054 | P0030055 | P0030056 | P0030057 | P0030058 | P0030059 | P0030060 | P0030061 | P0030062 | P0030063 | P0030064 | P0030065 | P0030066 | P0030067 | P0030068 | P0030069 | P0030070 | P0030071 | P0040001 | P0040002 | P0040003 | P0040004 | P0040005 | P0040006 | P0040007 | P0040008 | P0040009 | P0040010 | P0040011 | P0040012 | P0040013 | P0040014 | P0040015 | P0040016 | P0040017 | P0040018 | P0040019 | P0040020 | P0040021 | P0040022 | P0040023 | P0040024 | P0040025 | P0040026 | P0040027 | P0040028 | P0040029 | P0040030 | P0040031 | P0040032 | P0040033 | P0040034 | P0040035 | P0040036 | P0040037 | P0040038 | P0040039 | P0040040 | P0040041 | P0040042 | P0040043 | P0040044 | P0040045 | P0040046 | P0040047 | P0040048 | P0040049 | P0040050 | P0040051 | P0040052 | P0040053 | P0040054 | P0040055 | P0040056 | P0040057 | P0040058 | P0040059 | P0040060 | P0040061 | P0040062 | P0040063 | P0040064 | P0040065 | P0040066 | P0040067 | P0040068 | P0040069 | P0040070 | P0040071 | P0040072 | P0040073 | H0010001 | H0010002 | H0010003 | CIFSN_y390 | P0050001 | P0050002 | P0050003 | P0050004 | P0050005 | P0050006 | P0050007 | P0050008 | P0050009 | P0050010 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | PLST | CA | 50 | 00 | 0 | 0 | 0 | 2 | 0500000US06001 | 06001 | 4 | 9 | 6 | 1779778 | 1.0 | H1 | 1675839.0 | NaN | None | NaN | None | None | None | None | None | None | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | None | NaN | None | NaN | NaN | NaN | None | NaN | None | None | NaN | NaN | NaN | 41860.0 | 1.0 | 488.0 | 36084.0 | NaN | NaN | NaN | NaN | None | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | None | None | None | NaN | NaN | NaN | None | 1910017353 | 216902808 | Alameda | Alameda County | A | None | 1682353 | 621958 | 37.647139 | -121.912488 | 06 | None | NaN | 1 | 1682353 | 1491537 | 523836 | 164879 | 19659 | 545261 | 14123 | 223779 | 190816 | 171703 | 13579 | 10768 | 39888 | 2293 | 81119 | 2682 | 4125 | 640 | 4182 | 707 | 151 | 3685 | 3684 | 3285 | 915 | 16711 | 2460 | 1652 | 154 | 1551 | 1066 | 137 | 4502 | 1704 | 2121 | 206 | 176 | 34 | 231 | 205 | 194 | 30 | 64 | 77 | 20 | 127 | 2112 | 404 | 49 | 819 | 110 | 159 | 22 | 124 | 257 | 21 | 105 | 25 | 8 | 0 | 6 | 3 | 262 | 95 | 129 | 14 | 12 | 12 | 0 | 28 | 28 | 1682353 | 393749 | 1288604 | 1200067 | 472277 | 159499 | 4131 | 540511 | 13209 | 10440 | 88537 | 80792 | 11936 | 7033 | 37058 | 1921 | 9261 | 2221 | 3799 | 566 | 1376 | 452 | 92 | 59 | 3343 | 1125 | 550 | 6898 | 1803 | 1394 | 116 | 417 | 710 | 92 | 67 | 1444 | 315 | 15 | 105 | 17 | 71 | 157 | 72 | 13 | 25 | 7 | 3 | 55 | 750 | 300 | 27 | 120 | 99 | 71 | 2 | 78 | 5 | 0 | 27 | 14 | 3 | 0 | 1 | 3 | 92 | 48 | 23 | 7 | 7 | 7 | 0 | 5 | 5 | 2 | 1338388 | 1208791 | 448581 | 134525 | 14919 | 434901 | 10921 | 164944 | 129597 | 117917 | 8444 | 8557 | 21834 | 1557 | 61432 | 2112 | 2214 | 361 | 2852 | 426 | 119 | 2807 | 2882 | 1654 | 666 | 10250 | 1748 | 665 | 75 | 860 | 585 | 87 | 3459 | 1075 | 930 | 108 | 100 | 20 | 172 | 100 | 97 | 4 | 40 | 40 | 9 | 76 | 1246 | 229 | 28 | 585 | 41 | 81 | 5 | 75 | 105 | 12 | 46 | 25 | 7 | 0 | 4 | 3 | 159 | 51 | 85 | 7 | 7 | 9 | 0 | 25 | 25 | 1338388 | 286178 | 1052210 | 994680 | 410177 | 131210 | 3319 | 431949 | 10355 | 7670 | 57530 | 52710 | 7762 | 5969 | 20670 | 1375 | 7303 | 1834 | 2101 | 332 | 1126 | 276 | 83 | 49 | 2660 | 709 | 461 | 4280 | 1369 | 575 | 64 | 279 | 415 | 60 | 58 | 942 | 179 | 4 | 66 | 8 | 63 | 84 | 43 | 1 | 24 | 7 | 0 | 39 | 472 | 179 | 22 | 92 | 41 | 43 | 2 | 59 | 5 | 0 | 8 | 14 | 3 | 0 | 1 | 3 | 63 | 28 | 21 | 0 | 7 | 7 | 0 | 5 | 5 | 621958 | 591636 | 30322 | 3 | 53833 | 10130 | 3406 | 397 | 6218 | 109 | 43703 | 17463 | 421 | 25819 |
1 | PLST | CA | 50 | 00 | 0 | 0 | 0 | 3 | 0500000US06003 | 06003 | 4 | 9 | 6 | 1779778 | 3.0 | H1 | 1675840.0 | NaN | None | NaN | None | None | None | None | None | None | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | None | NaN | None | NaN | NaN | NaN | None | NaN | None | None | NaN | NaN | NaN | 99999.0 | 9.0 | 999.0 | 99999.0 | NaN | NaN | NaN | NaN | None | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | None | None | None | NaN | NaN | NaN | None | 1912292607 | 12557304 | Alpine | Alpine County | A | None | 1204 | 1540 | 38.621783 | -119.798352 | 06 | None | NaN | 1 | 1204 | 1085 | 814 | 10 | 236 | 12 | 0 | 13 | 119 | 89 | 5 | 10 | 6 | 1 | 50 | 5 | 2 | 1 | 0 | 1 | 1 | 3 | 2 | 2 | 0 | 20 | 2 | 0 | 4 | 0 | 0 | 2 | 3 | 0 | 4 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 9 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1204 | 84 | 1120 | 1044 | 801 | 10 | 214 | 12 | 0 | 7 | 76 | 56 | 5 | 10 | 4 | 1 | 25 | 5 | 0 | 1 | 0 | 1 | 0 | 1 | 2 | 1 | 0 | 15 | 2 | 0 | 3 | 0 | 0 | 2 | 0 | 0 | 4 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 1009 | 941 | 736 | 9 | 174 | 11 | 0 | 11 | 68 | 54 | 5 | 1 | 2 | 1 | 40 | 0 | 2 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 8 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0 | 3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1009 | 62 | 947 | 915 | 726 | 9 | 162 | 11 | 0 | 7 | 32 | 26 | 5 | 1 | 0 | 1 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1540 | 530 | 1010 | 3 | 52 | 1 | 1 | 0 | 0 | 0 | 51 | 0 | 0 | 51 |
2 | PLST | CA | 50 | 00 | 0 | 0 | 0 | 4 | 0500000US06005 | 06005 | 4 | 9 | 6 | 1779778 | 5.0 | H1 | 1675841.0 | NaN | None | NaN | None | None | None | None | None | None | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | None | NaN | None | NaN | NaN | NaN | None | NaN | None | None | NaN | NaN | NaN | 99999.0 | 9.0 | 999.0 | 99999.0 | NaN | NaN | NaN | NaN | None | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | None | None | None | NaN | NaN | NaN | None | 1539965777 | 29438413 | Amador | Amador County | A | None | 40474 | 18805 | 38.443549 | -120.653858 | 06 | None | NaN | 1 | 40474 | 36596 | 31104 | 1236 | 757 | 582 | 82 | 2835 | 3878 | 3591 | 163 | 1123 | 307 | 59 | 1804 | 20 | 5 | 0 | 6 | 18 | 2 | 15 | 59 | 6 | 4 | 261 | 28 | 7 | 4 | 12 | 34 | 11 | 87 | 26 | 19 | 5 | 2 | 1 | 1 | 8 | 1 | 7 | 0 | 0 | 0 | 8 | 17 | 0 | 2 | 4 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 9 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 40474 | 6014 | 34460 | 32393 | 29725 | 1215 | 577 | 554 | 73 | 249 | 2067 | 1948 | 139 | 977 | 263 | 52 | 417 | 13 | 5 | 0 | 2 | 9 | 2 | 9 | 59 | 1 | 0 | 107 | 24 | 4 | 0 | 6 | 31 | 7 | 2 | 22 | 0 | 0 | 0 | 0 | 0 | 8 | 1 | 0 | 0 | 0 | 0 | 2 | 9 | 0 | 2 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 34042 | 31277 | 26465 | 1193 | 578 | 498 | 60 | 2483 | 2765 | 2617 | 99 | 831 | 185 | 42 | 1353 | 9 | 0 | 0 | 2 | 16 | 0 | 15 | 59 | 6 | 0 | 133 | 13 | 1 | 0 | 6 | 15 | 7 | 56 | 12 | 10 | 0 | 2 | 0 | 1 | 5 | 1 | 0 | 0 | 0 | 0 | 4 | 10 | 0 | 2 | 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 5 | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 34042 | 4550 | 29492 | 27953 | 25572 | 1184 | 450 | 478 | 53 | 216 | 1539 | 1471 | 87 | 755 | 160 | 40 | 343 | 9 | 0 | 0 | 1 | 7 | 0 | 9 | 59 | 1 | 0 | 59 | 13 | 1 | 0 | 6 | 12 | 7 | 0 | 12 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | 0 | 0 | 0 | 0 | 2 | 6 | 0 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 3 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 18805 | 15678 | 3127 | 3 | 4311 | 4098 | 4045 | 2 | 51 | 0 | 213 | 0 | 0 | 213 |
3 | PLST | CA | 50 | 00 | 0 | 0 | 0 | 5 | 0500000US06007 | 06007 | 4 | 9 | 6 | 1779778 | 7.0 | H1 | 1675842.0 | NaN | None | NaN | None | None | None | None | None | None | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | None | NaN | None | NaN | NaN | NaN | None | NaN | None | None | NaN | NaN | NaN | 17020.0 | 1.0 | 999.0 | 99999.0 | NaN | NaN | NaN | NaN | None | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | None | None | None | NaN | NaN | NaN | None | 4238488698 | 105260548 | Butte | Butte County | A | None | 211632 | 90133 | 39.665336 | -121.603209 | 06 | None | NaN | 1 | 211632 | 186947 | 149557 | 3644 | 4492 | 10533 | 573 | 18148 | 24685 | 22849 | 1764 | 6709 | 2396 | 388 | 10402 | 177 | 93 | 24 | 157 | 69 | 22 | 374 | 172 | 84 | 18 | 1652 | 345 | 97 | 16 | 124 | 162 | 61 | 480 | 182 | 87 | 21 | 16 | 1 | 21 | 4 | 7 | 0 | 5 | 16 | 1 | 6 | 156 | 30 | 7 | 41 | 1 | 6 | 7 | 29 | 23 | 2 | 7 | 0 | 2 | 0 | 0 | 1 | 28 | 12 | 11 | 2 | 3 | 0 | 0 | 0 | 0 | 211632 | 40112 | 171520 | 158046 | 139651 | 3320 | 3050 | 10333 | 508 | 1184 | 13474 | 12581 | 1537 | 5837 | 2174 | 348 | 2129 | 127 | 80 | 22 | 55 | 50 | 21 | 21 | 154 | 25 | 1 | 824 | 263 | 87 | 16 | 37 | 122 | 45 | 31 | 170 | 15 | 2 | 6 | 0 | 16 | 4 | 2 | 0 | 5 | 0 | 0 | 3 | 55 | 23 | 2 | 10 | 1 | 0 | 0 | 16 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | 5 | 6 | 0 | 3 | 0 | 0 | 0 | 0 | 2 | 168339 | 151623 | 123122 | 2773 | 3324 | 7963 | 464 | 13977 | 16716 | 15688 | 852 | 4862 | 1444 | 259 | 7450 | 105 | 56 | 13 | 112 | 41 | 19 | 270 | 132 | 65 | 8 | 921 | 181 | 50 | 1 | 59 | 56 | 26 | 325 | 107 | 50 | 18 | 5 | 0 | 16 | 4 | 4 | 0 | 5 | 8 | 0 | 6 | 86 | 7 | 6 | 19 | 1 | 6 | 7 | 20 | 13 | 2 | 3 | 0 | 2 | 0 | 0 | 0 | 21 | 12 | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 168339 | 28425 | 139914 | 130683 | 116628 | 2606 | 2317 | 7828 | 416 | 888 | 9231 | 8761 | 776 | 4390 | 1334 | 234 | 1637 | 79 | 50 | 11 | 39 | 31 | 18 | 18 | 120 | 23 | 1 | 443 | 144 | 44 | 1 | 24 | 49 | 26 | 25 | 100 | 4 | 2 | 0 | 0 | 12 | 4 | 0 | 0 | 5 | 0 | 0 | 3 | 19 | 5 | 1 | 5 | 1 | 0 | 0 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 90133 | 83268 | 6865 | 3 | 4941 | 1449 | 496 | 31 | 902 | 20 | 3492 | 2234 | 0 | 1258 |
4 | PLST | CA | 50 | 00 | 0 | 0 | 0 | 6 | 0500000US06009 | 06009 | 4 | 9 | 6 | 1779778 | 9.0 | H1 | 1675885.0 | NaN | None | NaN | None | None | None | None | None | None | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | None | NaN | None | NaN | NaN | NaN | None | NaN | None | None | NaN | NaN | NaN | 99999.0 | 9.0 | 999.0 | 99999.0 | NaN | NaN | NaN | NaN | None | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | NaN | None | None | None | None | None | None | None | NaN | NaN | NaN | None | 2641837359 | 43789489 | Calaveras | Calaveras County | A | None | 45292 | 27422 | 38.191068 | -120.554106 | 06 | None | NaN | 1 | 45292 | 40264 | 36315 | 364 | 747 | 743 | 100 | 1995 | 5028 | 4706 | 195 | 1513 | 428 | 110 | 2285 | 13 | 13 | 0 | 24 | 6 | 6 | 43 | 39 | 28 | 3 | 285 | 15 | 8 | 2 | 28 | 25 | 24 | 104 | 39 | 21 | 0 | 8 | 0 | 2 | 0 | 0 | 0 | 0 | 5 | 0 | 4 | 31 | 6 | 1 | 4 | 0 | 7 | 0 | 4 | 0 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 5 | 3 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 45292 | 5865 | 39427 | 36548 | 34668 | 334 | 497 | 706 | 75 | 268 | 2879 | 2748 | 182 | 1370 | 385 | 109 | 618 | 13 | 5 | 0 | 2 | 0 | 6 | 5 | 39 | 11 | 3 | 112 | 15 | 6 | 0 | 10 | 25 | 8 | 8 | 36 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 15 | 3 | 1 | 4 | 0 | 0 | 0 | 4 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 4 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 37337 | 33724 | 30591 | 278 | 613 | 652 | 82 | 1508 | 3613 | 3411 | 105 | 1163 | 237 | 80 | 1694 | 11 | 5 | 0 | 22 | 1 | 6 | 36 | 35 | 13 | 3 | 184 | 11 | 6 | 2 | 20 | 18 | 15 | 67 | 24 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 2 | 16 | 5 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 3 | 3 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 37337 | 4153 | 33184 | 31086 | 29518 | 254 | 422 | 621 | 66 | 205 | 2098 | 2006 | 101 | 1058 | 221 | 80 | 474 | 11 | 5 | 0 | 2 | 0 | 6 | 5 | 35 | 5 | 3 | 81 | 11 | 6 | 0 | 10 | 18 | 4 | 8 | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 10 | 2 | 1 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 27422 | 18758 | 8664 | 3 | 461 | 311 | 170 | 40 | 101 | 0 | 150 | 0 | 0 | 150 |
Koalas is an open-source Python package that implements the pandas API on top of Apache Spark, to make the pandas API scalable to big data.
import databricks.koalas as ks
data_ks = ks.read_csv("/FileStore/tables/tables/skum_test/ca_2020_census/California_CA.csv", header=0)
#display(data_ks)
Pandas | Spark | Spark with inferSchema option | Koalas |
---|---|---|---|
24.44 secs | 7.14 secs | 5.44 secs | 3.22 secs |
#data_ks.info()
data_ks.shape
The following 104 columns of data will be used for analysis:
REGION Region
DIVISION Division
COUNTY County (FIPS)
CBSA Metropolitan Statistical Area/Micropolitan Statistical Area
MEMI Metropolitan/Micropolitan Indicator
CSA Combined Statistical Area
METDIV Metropolitan Division
AREALAND Area (Land)
AREAWATR Area (Water)
BASENAME Area Base Name
NAME Area Name-Legal/Statistical Area Description (LSAD) Term-Part Indicator
FUNCSTAT Functional Status Code
POP100 Population Count (100%)
HU100 Housing Unit Count (100%)
INTPTLAT Internal Point (Latitude)
INTPTLON Internal Point (Longitude)
LSADC Legal/Statistical Area Description Code
P0010001 Total:
P0010002 Population of one race:
P0010003 White alone
P0010004 Black or African American alone
P0010005 American Indian and Alaska Native alone
P0010006 Asian alone
P0010007 Native Hawaiian and Other Pacific Islander alone
P0010008 Some Other Race alone
P0010009 Population of two or more races:
P0010010 Population of two races:
P0010011 White; Black or African American
P0010012 White; American Indian and Alaska Native
P0010013 White; Asian
P0010014 White; Native Hawaiian and Other Pacific Islander
P0010015 White; Some Other Race
P0010016 Black or African American; American Indian and Alaska Native
P0010017 Black or African American; Asian
P0010018 Black or African American; Native Hawaiian and Other Pacific Islander
P0010019 Black or African American; Some Other Race
P0010020 American Indian and Alaska Native; Asian
P0010021 American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander
P0010022 American Indian and Alaska Native; Some Other Race
P0010023 Asian; Native Hawaiian and Other Pacific Islander
P0010024 Asian; Some Other Race
P0010025 Native Hawaiian and Other Pacific Islander; Some Other Race
P0010026 Population of three races:
P0010027 White; Black or African American; American Indian and Alaska Native
P0010028 White; Black or African American; Asian
P0010029 White; Black or African American; Native Hawaiian and Other Pacific Islander
P0010030 White; Black or African American; Some Other Race
P0010031 White; American Indian and Alaska Native; Asian
P0010032 White; American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander
P0010033 White; American Indian and Alaska Native; Some Other Race
P0010034 White; Asian; Native Hawaiian and Other Pacific Islander
P0010035 White; Asian; Some Other Race
P0010036 White; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010037 Black or African American; American Indian and Alaska Native; Asian
P0010038 Black or African American; American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander
P0010039 Black or African American; American Indian and Alaska Native; Some Other Race
P0010040 Black or African American; Asian; Native Hawaiian and Other Pacific Islander
P0010041 Black or African American; Asian; Some Other Race
P0010042 Black or African American; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010043 American Indian and Alaska Native; Asian; Native Hawaiian and Other Pacific Islander
P0010044 American Indian and Alaska Native; Asian; Some Other Race
P0010045 American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010046 Asian; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010047 Population of four races:
P0010048 White; Black or African American; American Indian and Alaska Native; Asian
P0010049 White; Black or African American; American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander
P0010050 White; Black or African American; American Indian and Alaska Native; Some Other Race
P0010051 White; Black or African American; Asian; Native Hawaiian and Other Pacific Islander
P0010052 White; Black or African American; Asian; Some Other Race
P0010053 White; Black or African American; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010054 White; American Indian and Alaska Native; Asian; Native Hawaiian and Other Pacific Islander
P0010055 White; American Indian and Alaska Native; Asian; Some Other Race
P0010056 White; American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010057 White; Asian; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010058 Black or African American; American Indian and Alaska Native; Asian; Native Hawaiian and Other Pacific Islander
P0010059 Black or African American; American Indian and Alaska Native; Asian; Some Other Race
P0010060 Black or African American; American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010061 Black or African American; Asian; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010062 American Indian and Alaska Native; Asian; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010063 Population of five races:
P0010064 White; Black or African American; American Indian and Alaska Native; Asian; Native Hawaiian and Other Pacific Islander
P0010065 White; Black or African American; American Indian and Alaska Native; Asian; Some Other Race
P0010066 White; Black or African American; American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010067 White; Black or African American; Asian; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010068 White; American Indian and Alaska Native; Asian; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010069 Black or African American; American Indian and Alaska Native; Asian; Native Hawaiian and Other Pacific Islander; Some Other Race
P0010070 Population of six races:
P0010071 White; Black or African American; American Indian and Alaska Native; Asian; Native Hawaiian and Other Pacific Islander; Some Other Race
P0020002 Hispanic or Latino
P0020003 Not Hispanic or Latino:
H0010001 H1-1: Total
H0010002 H1-2: Occupied
H0010003 H1-3: Vacant
P0050001 Total:
P0050002 Institutionalized population:
P0050003 Correctional facilities for adults
P0050004 Juvenile facilities
P0050005 Nursing facilities/Skilled-nursing facilities
P0050006 Other institutional facilities
P0050007 Noninstitutionalized population:
P0050008 College/University student housing
P0050009 Military quarters
P0050010 Other noninstitutional facilities
census_df_sp_sub = census_df_sp.select(["REGION","DIVISION","COUNTY","CBSA","MEMI","CSA","METDIV","AREALAND","AREAWATR","BASENAME","Name","POP100","HU100","INTPTLAT","INTPTLON","LSADC","P0010001", "P0010002", "P0010003", "P0010004", "P0010005", "P0010006", "P0010007", "P0010008", "P0010009", "P0010010", "P0010011", "P0010012", "P0010013", "P0010014", "P0010015", "P0010016", "P0010017", "P0010018", "P0010019", "P0010020", "P0010021", "P0010022", "P0010023", "P0010024", "P0010025", "P0010026", "P0010027", "P0010028", "P0010029", "P0010030", "P0010031", "P0010032", "P0010033", "P0010034", "P0010035", "P0010036", "P0010037", "P0010038", "P0010039", "P0010040", "P0010041", "P0010042", "P0010043", "P0010044", "P0010045", "P0010046", "P0010047", "P0010048", "P0010049", "P0010050", "P0010051", "P0010052", "P0010053", "P0010054", "P0010055", "P0010056", "P0010057", "P0010058", "P0010059", "P0010060", "P0010061", "P0010062", "P0010063", "P0010064", "P0010065", "P0010066", "P0010067", "P0010068", "P0010069", "P0010070", "P0010071", "P0020002", "P0020003", "H0010001", "H0010002", "H0010003", "P0050001", "P0050002", "P0050003", "P0050004", "P0050005", "P0050006", "P0050007", "P0050008", "P0050009", "P0050010"])
#display(census_df_sp_sub)
#df.colRegex("P001")
display(census_df_sp_sub.summary())
summary | REGION | DIVISION | COUNTY | CBSA | MEMI | CSA | METDIV | AREALAND | AREAWATR | BASENAME | Name | POP100 | HU100 | INTPTLAT | INTPTLON | LSADC | P0010001 | P0010002 | P0010003 | P0010004 | P0010005 | P0010006 | P0010007 | P0010008 | P0010009 | P0010010 | P0010011 | P0010012 | P0010013 | P0010014 | P0010015 | P0010016 | P0010017 | P0010018 | P0010019 | P0010020 | P0010021 | P0010022 | P0010023 | P0010024 | P0010025 | P0010026 | P0010027 | P0010028 | P0010029 | P0010030 | P0010031 | P0010032 | P0010033 | P0010034 | P0010035 | P0010036 | P0010037 | P0010038 | P0010039 | P0010040 | P0010041 | P0010042 | P0010043 | P0010044 | P0010045 | P0010046 | P0010047 | P0010048 | P0010049 | P0010050 | P0010051 | P0010052 | P0010053 | P0010054 | P0010055 | P0010056 | P0010057 | P0010058 | P0010059 | P0010060 | P0010061 | P0010062 | P0010063 | P0010064 | P0010065 | P0010066 | P0010067 | P0010068 | P0010069 | P0010070 | P0010071 | P0020002 | P0020003 | H0010001 | H0010002 | H0010003 | P0050001 | P0050002 | P0050003 | P0050004 | P0050005 | P0050006 | P0050007 | P0050008 | P0050009 | P0050010 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 669171 | 669171 | 650290 | 650290 | 650290 | 650290 | 650290 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 | 669171 |
mean | 4.0 | 9.0 | 56.366931676636575 | 38679.26071752603 | 1.3898522197788679 | 554.2995555828937 | 76237.78717956605 | 2.8677515498694655E7 | 1445348.4535836133 | 1832.536732234136 | null | 3059.615147697674 | 1113.5215647420464 | 36.032115277906854 | -119.64066830452249 | 26.733684030450775 | 3059.615147697674 | 2613.714258388364 | 1258.4589409881778 | 173.5204469410659 | 48.9568824709977 | 472.2799613252816 | 12.19083612409982 | 648.3071905387412 | 445.9008893093096 | 416.4643730825155 | 19.981495013979984 | 29.66331176933848 | 48.420317078893135 | 3.5101820013120713 | 280.72336816747884 | 2.8957964406706207 | 4.014609120837574 | 0.6792180175171967 | 6.895657462741212 | 1.020704722709143 | 0.19860244989696207 | 8.816177030983113 | 3.9774721259588355 | 4.740206912732321 | 0.9272547674660139 | 26.405721407532603 | 3.201568806777341 | 1.8671311219404307 | 0.2581283408874563 | 2.5513657943933614 | 1.7074604249138112 | 0.2554354567068806 | 9.24852093112224 | 2.400646471529699 | 3.273394991713628 | 0.328717173936109 | 0.2000086674407588 | 0.04289635982431994 | 0.2618583291864112 | 0.20435733168353082 | 0.20147466043806442 | 0.04687889941434999 | 0.07420076482692764 | 0.113359664420604 | 0.024413789599369965 | 0.14390342677731102 | 2.6784095545084887 | 0.503422891906553 | 0.0758356234803959 | 0.9203895566305175 | 0.15077909831717154 | 0.2116663752613308 | 0.02595001875454854 | 0.19549860947351275 | 0.33209598144569924 | 0.03252681302686458 | 0.16080941941596394 | 0.020153294150523558 | 0.021743321213860134 | 0.005952140783148104 | 0.01316703802167159 | 0.008419372626727697 | 0.31766768135498996 | 0.103903187675497 | 0.1549753351534959 | 0.018213580684159954 | 0.016043731721787108 | 0.02205265918576866 | 0.002479186934281372 | 0.034717583397965546 | 0.034717583397965546 | 1206.5007987494976 | 1853.1143489481763 | 1113.5215647420464 | 1042.8784271882673 | 70.64313755377923 | 70.68672282570523 | 26.4018389918272 | 15.296785126671658 | 0.6814057991156222 | 9.684245133157294 | 0.7394029328826264 | 44.284883833878034 | 17.845792779424094 | 4.347791521150797 | 22.091299533303147 |
stddev | 0.0 | 0.0 | 28.31755854283744 | 15294.999450959238 | 1.623580576183806 | 270.1235591357882 | 33581.03897460754 | 7.881045914630078E8 | 4.494612437568705E7 | 1362.64644400938 | null | 66405.47319871612 | 24431.340821414025 | 2.3066087112499662 | 2.0902436730725986 | 25.362129522066663 | 66405.47319871612 | 56853.01741716403 | 26526.04242970642 | 4595.227009682515 | 1060.2947009505854 | 11095.468936577807 | 278.1856483724344 | 15286.754026573077 | 9606.180301680255 | 8971.494547229793 | 448.8439672083964 | 590.8450733585147 | 1074.9128948779255 | 72.65506501955869 | 6136.860344046799 | 71.65700168239906 | 94.854026160174 | 15.98677244280636 | 179.10509565816344 | 22.166134660665143 | 4.2246339388363285 | 206.5055051043879 | 87.47833887109377 | 106.04737114758841 | 21.221856045450323 | 573.1525791990103 | 72.15967177677882 | 42.057610874354104 | 5.747666701614405 | 57.86506363067629 | 35.35640895983642 | 5.169246161049264 | 204.17508670962263 | 51.408998717980424 | 70.47563280844962 | 6.91416990943495 | 4.636601663755835 | 1.102240632199946 | 6.325829834166733 | 4.855388638690167 | 4.731520760612667 | 1.1410024230780134 | 1.5805620556340394 | 2.4675008146077446 | 0.5354034287121976 | 3.1674736799695964 | 59.43995607021075 | 11.468646987613933 | 1.7201859377688478 | 21.449689415311948 | 3.4206981519379043 | 4.873442687524608 | 0.5832214962803234 | 4.109139259114911 | 7.215087980828405 | 0.6897207574776267 | 3.441005165520258 | 0.5021498570588202 | 0.49869988729761555 | 0.17870930429585002 | 0.2965270764180188 | 0.20318667124143872 | 7.078996390196105 | 2.3648623549130847 | 3.568688830888468 | 0.3975957995646648 | 0.3813182936499723 | 0.5296301026671097 | 0.0831706430900629 | 0.8453477335540394 | 0.8453477335540394 | 27432.786453061643 | 40094.052441055195 | 24431.340821414025 | 22903.924030413393 | 1604.306770712246 | 1662.1753646137315 | 577.9366182088413 | 372.60041352603315 | 16.466467763815917 | 227.47278374446967 | 25.182709662978187 | 1168.0248620705302 | 555.4863597123475 | 221.16130607469864 | 568.7430289189296 |
min | 4 | 9 | 1.0 | 12540.0 | 1.0 | 260.0 | 11244.0 | 0 | 0 | 0 | ABC Unified School District | 0 | 0 | 32.5395511 | -124.4136928 | 00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
25% | 4 | 9 | 37.0 | 31080.0 | 1.0 | 348.0 | 36084.0 | 16661 | 0 | 1007.0 | null | 7 | 2 | 34.0052005 | -121.6376844 | 0.0 | 7 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
50% | 4 | 9 | 59.0 | 40140.0 | 1.0 | 472.0 | 99999.0 | 51601 | 0 | 2001.0 | null | 61 | 21 | 35.4357468 | -119.2696632 | 22.0 | 61 | 50 | 23 | 0 | 0 | 2 | 0 | 5 | 9 | 8 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 33 | 21 | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
75% | 4 | 9 | 75.0 | 41860.0 | 1.0 | 999.0 | 99999.0 | 638110 | 0 | 3000.0 | null | 246 | 93 | 37.866253799999996 | -117.930457 | 57.0 | 246 | 213 | 105 | 10 | 4 | 29 | 1 | 48 | 36 | 34 | 2 | 3 | 4 | 0 | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 96 | 148 | 93 | 87 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
max | 4 | 9 | 115.0 | 99999.0 | 9.0 | 999.0 | 99999.0 | 282055294521 | 15719629492 | Zayante | Zayante CDP | 28573858 | 10571847 | 42.0074191 | -114.13926399999998 | OT | 28573858 | 24485003 | 11576460 | 1799185 | 447548 | 4474283 | 115115 | 6072412 | 4088855 | 3811637 | 194177 | 262089 | 459478 | 31625 | 2532955 | 29671 | 40474 | 6789 | 69354 | 9699 | 1847 | 82008 | 37940 | 44752 | 8779 | 247939 | 31244 | 18189 | 2467 | 24478 | 15573 | 2273 | 85549 | 22286 | 29988 | 2983 | 1983 | 455 | 2648 | 2083 | 1976 | 475 | 682 | 1045 | 211 | 1351 | 25817 | 5001 | 746 | 8969 | 1473 | 2086 | 243 | 1780 | 3058 | 293 | 1507 | 209 | 206 | 64 | 105 | 77 | 3120 | 1043 | 1529 | 164 | 153 | 208 | 23 | 342 | 342 | 11128716 | 17445142 | 10571847 | 9866472 | 705375 | 696051 | 234511 | 126683 | 6408 | 93493 | 7927 | 461540 | 197511 | 39597 | 226824 |
data_ks_sub = data_ks[["REGION","DIVISION","COUNTY","CBSA","MEMI","CSA","METDIV","AREALAND","AREAWATR","BASENAME","NAME","POP100","HU100","INTPTLAT","INTPTLON","LSADC","P0010001", "P0010002", "P0010003", "P0010004", "P0010005", "P0010006", "P0010007", "P0010008", "P0010009", "P0010010", "P0010011", "P0010012", "P0010013", "P0010014", "P0010015", "P0010016", "P0010017", "P0010018", "P0010019", "P0010020", "P0010021", "P0010022", "P0010023", "P0010024", "P0010025", "P0010026", "P0010027", "P0010028", "P0010029", "P0010030", "P0010031", "P0010032", "P0010033", "P0010034", "P0010035", "P0010036", "P0010037", "P0010038", "P0010039", "P0010040", "P0010041", "P0010042", "P0010043", "P0010044", "P0010045", "P0010046", "P0010047", "P0010048", "P0010049", "P0010050", "P0010051", "P0010052", "P0010053", "P0010054", "P0010055", "P0010056", "P0010057", "P0010058", "P0010059", "P0010060", "P0010061", "P0010062", "P0010063", "P0010064", "P0010065", "P0010066", "P0010067", "P0010068", "P0010069", "P0010070", "P0010071", "P0020002", "P0020003", "H0010001", "H0010002", "H0010003", "P0050001", "P0050002", "P0050003", "P0050004", "P0050005", "P0050006", "P0050007", "P0050008", "P0050009", "P0050010"]]
#data_ks_sub.dtypes
data_ks_sub.head()
REGION | DIVISION | COUNTY | CBSA | MEMI | CSA | METDIV | AREALAND | AREAWATR | BASENAME | NAME | POP100 | HU100 | INTPTLAT | INTPTLON | LSADC | P0010001 | P0010002 | P0010003 | P0010004 | P0010005 | P0010006 | P0010007 | P0010008 | P0010009 | P0010010 | P0010011 | P0010012 | P0010013 | P0010014 | P0010015 | P0010016 | P0010017 | P0010018 | P0010019 | P0010020 | P0010021 | P0010022 | P0010023 | P0010024 | P0010025 | P0010026 | P0010027 | P0010028 | P0010029 | P0010030 | P0010031 | P0010032 | P0010033 | P0010034 | P0010035 | P0010036 | P0010037 | P0010038 | P0010039 | P0010040 | P0010041 | P0010042 | P0010043 | P0010044 | P0010045 | P0010046 | P0010047 | P0010048 | P0010049 | P0010050 | P0010051 | P0010052 | P0010053 | P0010054 | P0010055 | P0010056 | P0010057 | P0010058 | P0010059 | P0010060 | P0010061 | P0010062 | P0010063 | P0010064 | P0010065 | P0010066 | P0010067 | P0010068 | P0010069 | P0010070 | P0010071 | P0020002 | P0020003 | H0010001 | H0010002 | H0010003 | P0050001 | P0050002 | P0050003 | P0050004 | P0050005 | P0050006 | P0050007 | P0050008 | P0050009 | P0050010 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | 9 | 1.0 | 41860.0 | 1.0 | 488.0 | 36084.0 | 1910017353 | 216902808 | Alameda | Alameda County | 1682353 | 621958 | 37.647139 | -121.912488 | 06 | 1682353 | 1491537 | 523836 | 164879 | 19659 | 545261 | 14123 | 223779 | 190816 | 171703 | 13579 | 10768 | 39888 | 2293 | 81119 | 2682 | 4125 | 640 | 4182 | 707 | 151 | 3685 | 3684 | 3285 | 915 | 16711 | 2460 | 1652 | 154 | 1551 | 1066 | 137 | 4502 | 1704 | 2121 | 206 | 176 | 34 | 231 | 205 | 194 | 30 | 64 | 77 | 20 | 127 | 2112 | 404 | 49 | 819 | 110 | 159 | 22 | 124 | 257 | 21 | 105 | 25 | 8 | 0 | 6 | 3 | 262 | 95 | 129 | 14 | 12 | 12 | 0 | 28 | 28 | 393749 | 1288604 | 621958 | 591636 | 30322 | 53833 | 10130 | 3406 | 397 | 6218 | 109 | 43703 | 17463 | 421 | 25819 |
1 | 4 | 9 | 3.0 | 99999.0 | 9.0 | 999.0 | 99999.0 | 1912292607 | 12557304 | Alpine | Alpine County | 1204 | 1540 | 38.621783 | -119.798352 | 06 | 1204 | 1085 | 814 | 10 | 236 | 12 | 0 | 13 | 119 | 89 | 5 | 10 | 6 | 1 | 50 | 5 | 2 | 1 | 0 | 1 | 1 | 3 | 2 | 2 | 0 | 20 | 2 | 0 | 4 | 0 | 0 | 2 | 3 | 0 | 4 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 9 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 84 | 1120 | 1540 | 530 | 1010 | 52 | 1 | 1 | 0 | 0 | 0 | 51 | 0 | 0 | 51 |
2 | 4 | 9 | 5.0 | 99999.0 | 9.0 | 999.0 | 99999.0 | 1539965777 | 29438413 | Amador | Amador County | 40474 | 18805 | 38.443549 | -120.653858 | 06 | 40474 | 36596 | 31104 | 1236 | 757 | 582 | 82 | 2835 | 3878 | 3591 | 163 | 1123 | 307 | 59 | 1804 | 20 | 5 | 0 | 6 | 18 | 2 | 15 | 59 | 6 | 4 | 261 | 28 | 7 | 4 | 12 | 34 | 11 | 87 | 26 | 19 | 5 | 2 | 1 | 1 | 8 | 1 | 7 | 0 | 0 | 0 | 8 | 17 | 0 | 2 | 4 | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 9 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 6014 | 34460 | 18805 | 15678 | 3127 | 4311 | 4098 | 4045 | 2 | 51 | 0 | 213 | 0 | 0 | 213 |
3 | 4 | 9 | 7.0 | 17020.0 | 1.0 | 999.0 | 99999.0 | 4238488698 | 105260548 | Butte | Butte County | 211632 | 90133 | 39.665336 | -121.603209 | 06 | 211632 | 186947 | 149557 | 3644 | 4492 | 10533 | 573 | 18148 | 24685 | 22849 | 1764 | 6709 | 2396 | 388 | 10402 | 177 | 93 | 24 | 157 | 69 | 22 | 374 | 172 | 84 | 18 | 1652 | 345 | 97 | 16 | 124 | 162 | 61 | 480 | 182 | 87 | 21 | 16 | 1 | 21 | 4 | 7 | 0 | 5 | 16 | 1 | 6 | 156 | 30 | 7 | 41 | 1 | 6 | 7 | 29 | 23 | 2 | 7 | 0 | 2 | 0 | 0 | 1 | 28 | 12 | 11 | 2 | 3 | 0 | 0 | 0 | 0 | 40112 | 171520 | 90133 | 83268 | 6865 | 4941 | 1449 | 496 | 31 | 902 | 20 | 3492 | 2234 | 0 | 1258 |
4 | 4 | 9 | 9.0 | 99999.0 | 9.0 | 999.0 | 99999.0 | 2641837359 | 43789489 | Calaveras | Calaveras County | 45292 | 27422 | 38.191068 | -120.554106 | 06 | 45292 | 40264 | 36315 | 364 | 747 | 743 | 100 | 1995 | 5028 | 4706 | 195 | 1513 | 428 | 110 | 2285 | 13 | 13 | 0 | 24 | 6 | 6 | 43 | 39 | 28 | 3 | 285 | 15 | 8 | 2 | 28 | 25 | 24 | 104 | 39 | 21 | 0 | 8 | 0 | 2 | 0 | 0 | 0 | 0 | 5 | 0 | 4 | 31 | 6 | 1 | 4 | 0 | 7 | 0 | 4 | 0 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 5 | 3 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 5865 | 39427 | 27422 | 18758 | 8664 | 461 | 311 | 170 | 40 | 101 | 0 | 150 | 0 | 0 | 150 |
data_ks_sub.describe()
REGION | DIVISION | COUNTY | CBSA | MEMI | CSA | METDIV | AREALAND | AREAWATR | POP100 | HU100 | INTPTLAT | INTPTLON | P0010001 | P0010002 | P0010003 | P0010004 | P0010005 | P0010006 | P0010007 | P0010008 | P0010009 | P0010010 | P0010011 | P0010012 | P0010013 | P0010014 | P0010015 | P0010016 | P0010017 | P0010018 | P0010019 | P0010020 | P0010021 | P0010022 | P0010023 | P0010024 | P0010025 | P0010026 | P0010027 | P0010028 | P0010029 | P0010030 | P0010031 | P0010032 | P0010033 | P0010034 | P0010035 | P0010036 | P0010037 | P0010038 | P0010039 | P0010040 | P0010041 | P0010042 | P0010043 | P0010044 | P0010045 | P0010046 | P0010047 | P0010048 | P0010049 | P0010050 | P0010051 | P0010052 | P0010053 | P0010054 | P0010055 | P0010056 | P0010057 | P0010058 | P0010059 | P0010060 | P0010061 | P0010062 | P0010063 | P0010064 | P0010065 | P0010066 | P0010067 | P0010068 | P0010069 | P0010070 | P0010071 | P0020002 | P0020003 | H0010001 | H0010002 | H0010003 | P0050001 | P0050002 | P0050003 | P0050004 | P0050005 | P0050006 | P0050007 | P0050008 | P0050009 | P0050010 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 669171.0 | 669171.0 | 650290.000000 | 650290.000000 | 650290.000000 | 650290.000000 | 650290.000000 | 6.691710e+05 | 6.691710e+05 | 6.691710e+05 | 6.691710e+05 | 669171.000000 | 669171.000000 | 6.691710e+05 | 6.691710e+05 | 6.691710e+05 | 6.691710e+05 | 669171.000000 | 6.691710e+05 | 669171.000000 | 6.691710e+05 | 6.691710e+05 | 6.691710e+05 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 6.691710e+05 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 6.691710e+05 | 6.691710e+05 | 6.691710e+05 | 6.691710e+05 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 | 669171.000000 |
mean | 4.0 | 9.0 | 56.366932 | 38679.260718 | 1.389852 | 554.299556 | 76237.787180 | 2.867752e+07 | 1.445348e+06 | 3.059615e+03 | 1.113522e+03 | 36.032115 | -119.640668 | 3.059615e+03 | 2.613714e+03 | 1.258459e+03 | 1.735204e+02 | 48.956882 | 4.722800e+02 | 12.190836 | 6.483072e+02 | 4.459009e+02 | 4.164644e+02 | 19.981495 | 29.663312 | 48.420317 | 3.510182 | 2.807234e+02 | 2.895796 | 4.014609 | 0.679218 | 6.895657 | 1.020705 | 0.198602 | 8.816177 | 3.977472 | 4.740207 | 0.927255 | 26.405721 | 3.201569 | 1.867131 | 0.258128 | 2.551366 | 1.707460 | 0.255435 | 9.248521 | 2.400646 | 3.273395 | 0.328717 | 0.200009 | 0.042896 | 0.261858 | 0.204357 | 0.201475 | 0.046879 | 0.074201 | 0.113360 | 0.024414 | 0.143903 | 2.678410 | 0.503423 | 0.075836 | 0.920390 | 0.150779 | 0.211666 | 0.025950 | 0.195499 | 0.332096 | 0.032527 | 0.160809 | 0.020153 | 0.021743 | 0.005952 | 0.013167 | 0.008419 | 0.317668 | 0.103903 | 0.154975 | 0.018214 | 0.016044 | 0.022053 | 0.002479 | 0.034718 | 0.034718 | 1.206501e+03 | 1.853114e+03 | 1.113522e+03 | 1.042878e+03 | 70.643138 | 70.686723 | 26.401839 | 15.296785 | 0.681406 | 9.684245 | 0.739403 | 44.284884 | 17.845793 | 4.347792 | 22.091300 |
std | 0.0 | 0.0 | 28.317559 | 15294.999451 | 1.623581 | 270.123559 | 33581.038975 | 7.881046e+08 | 4.494612e+07 | 6.640547e+04 | 2.443134e+04 | 2.306609 | 2.090244 | 6.640547e+04 | 5.685302e+04 | 2.652604e+04 | 4.595227e+03 | 1060.294701 | 1.109547e+04 | 278.185648 | 1.528675e+04 | 9.606180e+03 | 8.971495e+03 | 448.843967 | 590.845073 | 1074.912895 | 72.655065 | 6.136860e+03 | 71.657002 | 94.854026 | 15.986772 | 179.105096 | 22.166135 | 4.224634 | 206.505505 | 87.478339 | 106.047371 | 21.221856 | 573.152579 | 72.159672 | 42.057611 | 5.747667 | 57.865064 | 35.356409 | 5.169246 | 204.175087 | 51.408999 | 70.475633 | 6.914170 | 4.636602 | 1.102241 | 6.325830 | 4.855389 | 4.731521 | 1.141002 | 1.580562 | 2.467501 | 0.535403 | 3.167474 | 59.439956 | 11.468647 | 1.720186 | 21.449689 | 3.420698 | 4.873443 | 0.583221 | 4.109139 | 7.215088 | 0.689721 | 3.441005 | 0.502150 | 0.498700 | 0.178709 | 0.296527 | 0.203187 | 7.078996 | 2.364862 | 3.568689 | 0.397596 | 0.381318 | 0.529630 | 0.083171 | 0.845348 | 0.845348 | 2.743279e+04 | 4.009405e+04 | 2.443134e+04 | 2.290392e+04 | 1604.306771 | 1662.175365 | 577.936618 | 372.600414 | 16.466468 | 227.472784 | 25.182710 | 1168.024862 | 555.486360 | 221.161306 | 568.743029 |
min | 4.0 | 9.0 | 1.000000 | 12540.000000 | 1.000000 | 260.000000 | 11244.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 32.539551 | -124.413693 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 4.0 | 9.0 | 37.000000 | 31080.000000 | 1.000000 | 348.000000 | 36084.000000 | 1.665900e+04 | 0.000000e+00 | 7.000000e+00 | 2.000000e+00 | 34.005323 | -121.637698 | 7.000000e+00 | 5.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 2.000000e+00 | 2.000000e+00 | 1.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 4.0 | 9.0 | 59.000000 | 40140.000000 | 1.000000 | 472.000000 | 99999.000000 | 5.159000e+04 | 0.000000e+00 | 6.100000e+01 | 2.100000e+01 | 35.434519 | -119.269756 | 6.100000e+01 | 5.000000e+01 | 2.300000e+01 | 0.000000e+00 | 0.000000 | 2.000000e+00 | 0.000000 | 5.000000e+00 | 9.000000e+00 | 8.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.500000e+01 | 3.300000e+01 | 2.100000e+01 | 1.900000e+01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 4.0 | 9.0 | 75.000000 | 41860.000000 | 1.000000 | 999.000000 | 99999.000000 | 6.380330e+05 | 0.000000e+00 | 2.460000e+02 | 9.300000e+01 | 37.866010 | -117.930126 | 2.460000e+02 | 2.130000e+02 | 1.050000e+02 | 1.000000e+01 | 4.000000 | 2.900000e+01 | 1.000000 | 4.800000e+01 | 3.600000e+01 | 3.400000e+01 | 2.000000 | 3.000000 | 4.000000 | 0.000000 | 2.400000e+01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.600000e+01 | 1.480000e+02 | 9.300000e+01 | 8.700000e+01 | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 4.0 | 9.0 | 115.000000 | 99999.000000 | 9.000000 | 999.000000 | 99999.000000 | 2.820553e+11 | 1.571963e+10 | 2.857386e+07 | 1.057185e+07 | 42.007419 | -114.139264 | 2.857386e+07 | 2.448500e+07 | 1.157646e+07 | 1.799185e+06 | 447548.000000 | 4.474283e+06 | 115115.000000 | 6.072412e+06 | 4.088855e+06 | 3.811637e+06 | 194177.000000 | 262089.000000 | 459478.000000 | 31625.000000 | 2.532955e+06 | 29671.000000 | 40474.000000 | 6789.000000 | 69354.000000 | 9699.000000 | 1847.000000 | 82008.000000 | 37940.000000 | 44752.000000 | 8779.000000 | 247939.000000 | 31244.000000 | 18189.000000 | 2467.000000 | 24478.000000 | 15573.000000 | 2273.000000 | 85549.000000 | 22286.000000 | 29988.000000 | 2983.000000 | 1983.000000 | 455.000000 | 2648.000000 | 2083.000000 | 1976.000000 | 475.000000 | 682.000000 | 1045.000000 | 211.000000 | 1351.000000 | 25817.000000 | 5001.000000 | 746.000000 | 8969.000000 | 1473.000000 | 2086.000000 | 243.000000 | 1780.000000 | 3058.000000 | 293.000000 | 1507.000000 | 209.000000 | 206.000000 | 64.000000 | 105.000000 | 77.000000 | 3120.000000 | 1043.000000 | 1529.000000 | 164.000000 | 153.000000 | 208.000000 | 23.000000 | 342.000000 | 342.000000 | 1.112872e+07 | 1.744514e+07 | 1.057185e+07 | 9.866472e+06 | 705375.000000 | 696051.000000 | 234511.000000 | 126683.000000 | 6408.000000 | 93493.000000 | 7927.000000 | 461540.000000 | 197511.000000 | 39597.000000 | 226824.000000 |
Noticing the following issues from the 'describe' table: Some counties have 0 value for POP100 (population count 100%), HU100 (Housing count 100%), H0010001 (Total) ????
# Plot NAME vs POP100
plot_df = data_ks_sub[["NAME","POP100","INTPTLAT","INTPTLON"]]
plot_df.rename(columns={"POP100":"Total Population"}, inplace=True)
plot_df.head()
NAME | Total Population | INTPTLAT | INTPTLON | |
---|---|---|---|---|
0 | Alameda County | 1682353 | 37.647139 | -121.912488 |
1 | Alpine County | 1204 | 38.621783 | -119.798352 |
2 | Amador County | 40474 | 38.443549 | -120.653858 |
3 | Butte County | 211632 | 39.665336 | -121.603209 |
4 | Calaveras County | 45292 | 38.191068 | -120.554106 |
plot_df.dtypes
#plot_df["NAME"].value_counts()
The NAME column has County / City / Block / Census Tract / School District. Separating County data
pd_temp = plot_df.toPandas()
pd_temp_city = pd_temp[pd_temp["NAME"].str.contains('city', regex=False)].drop_duplicates(subset=["NAME"])
pd_temp_county = pd_temp[pd_temp["NAME"].str.contains('County', regex=False)].drop_duplicates(subset=["NAME"])
pd_temp_school_district = pd_temp[pd_temp["NAME"].str.contains('School District', regex=False)].drop_duplicates(subset=["NAME"])
pd_temp_census_tract = pd_temp[pd_temp["NAME"].str.contains('Census Tract', regex=False)].drop_duplicates(subset=["NAME"])
pd_temp_block = pd_temp[pd_temp["NAME"].str.contains('Block', regex=False)].drop_duplicates(subset=["NAME"])
plt.figure(figsize=(25,10))
splot = sns.barplot(x="NAME",y="Total Population", palette="viridis", data=pd_temp_city.sample(n=50, random_state=1))
splot.axes.set_title("City population",fontsize=40)
splot.set_xlabel("Cities",fontsize=20)
splot.tick_params(labelsize=13)
for item in splot.get_xticklabels():
item.set_rotation(90)
plt.figure(figsize=(25,10))
splot = sns.barplot(x="NAME",y="Total Population", palette="viridis", data=pd_temp_county.sample(n=100, random_state=1))
splot.axes.set_title("County population",fontsize=40)
splot.set_xlabel("Counties",fontsize=20)
splot.tick_params(labelsize=13)
for item in splot.get_xticklabels():
item.set_rotation(90)
plt.figure(figsize=(25,10))
splot = sns.barplot(x="NAME",y="Total Population", palette="viridis", data=pd_temp_school_district.sample(n=50, random_state=1))
splot.axes.set_title("School District population",fontsize=40)
splot.set_xlabel("School Districts",fontsize=20)
splot.tick_params(labelsize=13)
for item in splot.get_xticklabels():
item.set_rotation(90)
plt.figure(figsize=(25,10))
splot = sns.barplot(x="NAME",y="Total Population", palette="viridis", data=pd_temp_census_tract.sample(n=50, random_state=1))
splot.axes.set_title("Census Tract - population",fontsize=40)
splot.set_xlabel("Census Tracts",fontsize=20)
splot.tick_params(labelsize=13)
for item in splot.get_xticklabels():
item.set_rotation(90)
# For a NAME, check the distribution of population by one race
# P0010001 Total:
# P0010002 Population of one race:
# P0010003 White alone
# P0010004 Black or African American alone
# P0010005 American Indian and Alaska Native alone
# P0010006 Asian alone
# P0010007 Native Hawaiian and Other Pacific Islander alone
# P0010008 Some Other Race alone
# P0020002 Hispanic or Latino
plot_df2 = data_ks_sub[["NAME","P0010003","P0010004","P0010005","P0010006","P0010007","P0010008","P0020002"]]
plot_df2.rename(columns={"P0010003":"White","P0010004":"Black or African American","P0010005":"American Indian and Alaska Native","P0010006":"Asian","P0010007":"Native Hawaiian and Other Pacific Islander","P0010008":"Some Other Race","P0020002":"Hispanic or Latino"}, inplace=True)
plot_df2.head()
NAME | White | Black or African American | American Indian and Alaska Native | Asian | Native Hawaiian and Other Pacific Islander | Some Other Race | Hispanic or Latino | |
---|---|---|---|---|---|---|---|---|
0 | Alameda County | 523836 | 164879 | 19659 | 545261 | 14123 | 223779 | 393749 |
1 | Alpine County | 814 | 10 | 236 | 12 | 0 | 13 | 84 |
2 | Amador County | 31104 | 1236 | 757 | 582 | 82 | 2835 | 6014 |
3 | Butte County | 149557 | 3644 | 4492 | 10533 | 573 | 18148 | 40112 |
4 | Calaveras County | 36315 | 364 | 747 | 743 | 100 | 1995 | 5865 |
pd_temp2 = plot_df2.toPandas()
pd_temp_SD = pd_temp2[pd_temp2["NAME"] == 'San Diego County']
pd_temp_SD
NAME | White | Black or African American | American Indian and Alaska Native | Asian | Native Hawaiian and Other Pacific Islander | Some Other Race | Hispanic or Latino | |
---|---|---|---|---|---|---|---|---|
36 | San Diego County | 1633129 | 155813 | 40968 | 410752 | 15286 | 520994 | 1119629 |
pd_temp_SD = pd_temp_SD.reset_index(drop=True)
pd_temp_SD
NAME | White | Black or African American | American Indian and Alaska Native | Asian | Native Hawaiian and Other Pacific Islander | Some Other Race | Hispanic or Latino | |
---|---|---|---|---|---|---|---|---|
0 | San Diego County | 1633129 | 155813 | 40968 | 410752 | 15286 | 520994 | 1119629 |
pd_temp_SD = pd_temp_SD.drop(["NAME"], axis=1)
pd_temp_SD
White | Black or African American | American Indian and Alaska Native | Asian | Native Hawaiian and Other Pacific Islander | Some Other Race | Hispanic or Latino | |
---|---|---|---|---|---|---|---|
0 | 1633129 | 155813 | 40968 | 410752 | 15286 | 520994 | 1119629 |
pd_temp_SD_T = pd_temp_SD.T
#pd_temp_SD_T = pd_temp_SD_T.rename(columns={"Population"})
pd_temp_SD_T
0 | |
---|---|
White | 1633129 |
Black or African American | 155813 |
American Indian and Alaska Native | 40968 |
Asian | 410752 |
Native Hawaiian and Other Pacific Islander | 15286 |
Some Other Race | 520994 |
Hispanic or Latino | 1119629 |
plt.figure(figsize=(35,10))
ax = pd_temp_SD_T.plot.bar()
plt.title("San Diego Population distribution by ethnicity")
pd_temp_city.head()
NAME | Total Population | INTPTLAT | INTPTLON | |
---|---|---|---|---|
36816 | Adelanto city | 38046 | 34.580902 | -117.439458 |
36818 | Agoura Hills city | 20299 | 34.148942 | -118.763930 |
36823 | Alameda city | 78280 | 37.741911 | -122.259914 |
36825 | Albany city | 20271 | 37.890650 | -122.318116 |
36828 | Alhambra city | 82868 | 34.083571 | -118.136444 |
pd_temp_city.shape
pd_temp_city_summary = pd_temp_city.describe()
pd_temp_city_summary
Total Population | INTPTLAT | INTPTLON | |
---|---|---|---|
count | 6.750000e+02 | 675.000000 | 675.000000 |
mean | 6.075501e+04 | 35.913016 | -119.678155 |
std | 1.791500e+05 | 2.252842 | 2.139734 |
min | 0.000000e+00 | 32.569140 | -124.261239 |
25% | 7.344000e+03 | 33.930746 | -121.826875 |
50% | 2.703800e+04 | 35.194361 | -119.318750 |
75% | 6.898100e+04 | 37.830509 | -117.869322 |
max | 3.898747e+06 | 41.964965 | -114.593545 |
pd_temp_city_summary.shape
max_pop_val = pd_temp_city_summary["Total Population"]["max"]
max_pop_val
pd_temp_city["Circle_radius"] = pd_temp_city["Total Population"].apply(lambda x: x / max_pop_val)
pd_temp_city.head()
NAME | Total Population | INTPTLAT | INTPTLON | Circle_radius | |
---|---|---|---|---|---|
36816 | Adelanto city | 38046 | 34.580902 | -117.439458 | 0.009759 |
36818 | Agoura Hills city | 20299 | 34.148942 | -118.763930 | 0.005207 |
36823 | Alameda city | 78280 | 37.741911 | -122.259914 | 0.020078 |
36825 | Albany city | 20271 | 37.890650 | -122.318116 | 0.005199 |
36828 | Alhambra city | 82868 | 34.083571 | -118.136444 | 0.021255 |
pd_temp_city = pd_temp_city.drop_duplicates(subset=["NAME"])
pd_temp_city.shape
pd_temp_city["Total Population"].describe()
pd_temp_city[pd_temp_city["NAME"] == 'Santa Barbara city']
NAME | Total Population | INTPTLAT | INTPTLON | Circle_radius | |
---|---|---|---|---|---|
38103 | Santa Barbara city | 88665 | 34.400601 | -119.712954 | 0.022742 |
#import folium as f
# Make an empty map
#mp = f.Map(location=[37,-119], tiles="OpenStreetMap", zoom_start=6)
#mp = f.Map(location=[37,-119], control_scale=True, zoom_start=6)
#mp
import folium as f
# Make an empty map
mp = f.Map(location=[37,-119], control_scale=True, zoom_start=7)
#data_m=pd_temp_city.sample(n=500, random_state=1)
data_m=pd_temp_city
for i in range(0,data_m.shape[0]):
popup = """
#City Name : <b>%s</b><br>
#Population : <b>%s</b><br>
""" % (data_m.iloc[i]["NAME"],
data_m.iloc[i]["Total Population"])
f.Circle(
location=[data_m.iloc[i]['INTPTLAT'], data_m.iloc[i]['INTPTLON']],
tooltip=popup,
#popup=data_m.iloc[i]['NAME'],
radius=float(data_m.iloc[i]['Circle_radius'])*50000,
color='blue',
fill=True,
#fill_color='blue'
).add_to(mp)
mp
#import webbrowser
#map_osm = folium.Map(location=[45.5236, -122.6750])
#mp.save('map.html')
#webbrowser.open('map.html')
#html_map = mp._repr_html_()
#displayHTML(html_map)
from IPython.core.display import HTML
HTML(mp._repr_html_())
#mp.save("/FileStore/sukumar/Ca_census_map.html")
#dbutils.fs.put("/FileStore/sukumar/Ca_census_map.html", mp)
#P0050002 Institutionalized population:
#P0050003 Correctional facilities for adults
#P0050004 Juvenile facilities
#P0050005 Nursing facilities/Skilled-nursing facilities
#P0050006 Other institutional facilities
#P0050007 Noninstitutionalized population:
df_k_means = data_ks_sub[["NAME","P0050002","P0050003","P0050004","P0050005","P0050006","P0050007"]].toPandas()
df_k_means.rename(columns={"P0050002":"Institutionalized population","P0050003":"Correctional facilities for adults","P0050004":"Juvenile facilities","P0050005":"Nursing facilities/Skilled-nursing facilities","P0050006":"Other institutional facilities","P0050007":"Noninstitutionalized population"}, inplace=True)
df_k_means = df_k_means[df_k_means["NAME"].str.contains('city', regex=False)].drop_duplicates(subset=["NAME"])
df_k_means.head()
NAME | Institutionalized population | Correctional facilities for adults | Juvenile facilities | Nursing facilities/Skilled-nursing facilities | Other institutional facilities | Noninstitutionalized population | |
---|---|---|---|---|---|---|---|
36816 | Adelanto city | 2674 | 2674 | 0 | 0 | 0 | 77 |
36818 | Agoura Hills city | 148 | 0 | 0 | 148 | 0 | 0 |
36823 | Alameda city | 916 | 0 | 0 | 916 | 0 | 614 |
36825 | Albany city | 10 | 0 | 0 | 10 | 0 | 1385 |
36828 | Alhambra city | 691 | 21 | 37 | 547 | 86 | 212 |
df_k_means.shape
df_k_means.dtypes
df_k_means["Nursing facilities/Skilled-nursing facilities"].describe()
df_k_means_sub = df_k_means[(df_k_means["Nursing facilities/Skilled-nursing facilities"] < 1000) & (df_k_means["Nursing facilities/Skilled-nursing facilities"] > 0)]
plt.figure(figsize=(25,10))
splot = sns.histplot(data = df_k_means_sub, x="Nursing facilities/Skilled-nursing facilities",palette="viridis")
splot.axes.set_title("Histogram of population in Nursing facilities/Skilled-nursing facilities excluding outliers",fontsize=40)
splot.set_xlabel("Nursing facilities/Skilled-nursing facilities",fontsize=20)
splot.tick_params(labelsize=13)
X_df = df_k_means_sub[["Institutionalized population","Noninstitutionalized population"]]
X_df = X_df.dropna()
X_df.shape
X = X_df.values
from sklearn.cluster import KMeans
Error = []
for i in range(1, 12):
kmeans = KMeans(n_clusters = i)
kmeans.fit(X)
Error.append(kmeans.inertia_)
plt.plot(range(1,12), Error)
plt.title("Elbow method")
plt.xlabel("Number of clusters")
plt.ylabel("Error")
Based on Elbow method, choosing n_clusters = 4
from sklearn.cluster import KMeans
Kmean = KMeans(n_clusters=4)
y = Kmean.fit_predict(X)
plt.figsize=(25,10)
frame = plt.scatter(X[:,0], X[:,1], c=y, s=50, cmap='viridis')
frame.axes.get_xaxis().set_visible(False)