{"cells":[{"cell_type":"code","source":["import pandas as pd"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"9f62a060-cab0-4567-8958-fd6e49efbf47"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df = pd.read_csv(\"/dbfs/FileStore/tables/diamonds.csv\", header='infer')\ndf"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"9ab1b725-aee4-4bb3-8b13-41ca18176d32"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
20.23GoodEVS156.965.03274.054.072.31
30.29PremiumIVS262.458.03344.204.232.63
40.31GoodJSI263.358.03354.344.352.75
.................................
539350.72IdealDSI160.857.027575.755.763.50
539360.72GoodDSI163.155.027575.695.753.61
539370.70Very GoodDSI162.860.027575.665.683.56
539380.86PremiumHSI261.058.027576.156.123.74
539390.75IdealDSI262.255.027575.835.873.64
\n

53940 rows × 10 columns

\n
","textData":"
Out[2]:
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
20.23GoodEVS156.965.03274.054.072.31
30.29PremiumIVS262.458.03344.204.232.63
40.31GoodJSI263.358.03354.344.352.75
.................................
539350.72IdealDSI160.857.027575.755.763.50
539360.72GoodDSI163.155.027575.695.753.61
539370.70Very GoodDSI162.860.027575.665.683.56
539380.86PremiumHSI261.058.027576.156.123.74
539390.75IdealDSI262.255.027575.835.873.64
\n

53940 rows × 10 columns

\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df.head()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"826f4fc9-4e07-4e16-a51b-18cedc28cc53"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
20.23GoodEVS156.965.03274.054.072.31
30.29PremiumIVS262.458.03344.204.232.63
40.31GoodJSI263.358.03354.344.352.75
\n
","textData":"
Out[7]:
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
20.23GoodEVS156.965.03274.054.072.31
30.29PremiumIVS262.458.03344.204.232.63
40.31GoodJSI263.358.03354.344.352.75
\n
"]}}],"execution_count":0},{"cell_type":"code","source":["!pip install pandas-profiling\n!pip install ipywidgets\n!pip install matplotlib==2.2.3\n\nfrom pandas_profiling import ProfileReport\nprof = ProfileReport(df.sample(n=10000))\n#prof = ProfileReport(df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"62d03235-8fad-4a03-b38b-2034eed42632"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["prof.to_file(output_file='output.html')\n\ndisplayHTML(prof.html)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1fe5229f-5ec6-4158-b07b-fb5076ebb18c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Pandas Profiling Report

Overview

Dataset statistics

Number of variables11
Number of observations10000
Missing cells0
Missing cells (%)0.0%
Duplicate rows0
Duplicate rows (%)0.0%
Total size in memory859.5 KiB
Average record size in memory88.0 B

Variable types

Numeric8
Categorical3

Alerts

carat is highly correlated with price and 3 other fieldsHigh correlation
price is highly correlated with carat and 3 other fieldsHigh correlation
x is highly correlated with carat and 3 other fieldsHigh correlation
y is highly correlated with carat and 3 other fieldsHigh correlation
z is highly correlated with carat and 3 other fieldsHigh correlation
carat is highly correlated with price and 3 other fieldsHigh correlation
price is highly correlated with carat and 3 other fieldsHigh correlation
x is highly correlated with carat and 3 other fieldsHigh correlation
y is highly correlated with carat and 3 other fieldsHigh correlation
z is highly correlated with carat and 3 other fieldsHigh correlation
carat is highly correlated with price and 3 other fieldsHigh correlation
price is highly correlated with carat and 3 other fieldsHigh correlation
x is highly correlated with carat and 3 other fieldsHigh correlation
y is highly correlated with carat and 3 other fieldsHigh correlation
z is highly correlated with carat and 3 other fieldsHigh correlation
df_index is highly correlated with carat and 3 other fieldsHigh correlation
carat is highly correlated with df_index and 4 other fieldsHigh correlation
cut is highly correlated with depth and 1 other fieldsHigh correlation
depth is highly correlated with cutHigh correlation
table is highly correlated with cutHigh correlation
price is highly correlated with df_index and 3 other fieldsHigh correlation
x is highly correlated with df_index and 4 other fieldsHigh correlation
y is highly correlated with carat and 2 other fieldsHigh correlation
z is highly correlated with df_index and 4 other fieldsHigh correlation
df_index has unique values Unique

Reproduction

Analysis started2021-12-09 22:36:56.032662
Analysis finished2021-12-09 22:37:11.760218
Duration15.73 seconds
Software versionpandas-profiling v3.1.0
Download configurationconfig.json

Variables

df_index
Real number (ℝ≥0)

HIGH CORRELATION
UNIQUE

Distinct10000
Distinct (%)100.0%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean27088.2074
Minimum1
Maximum53922
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum1
5-th percentile2652.65
Q113355.75
median27192
Q340817
95-th percentile51315.1
Maximum53922
Range53921
Interquartile range (IQR)27461.25

Descriptive statistics

Standard deviation15650.20796
Coefficient of variation (CV)0.5777498573
Kurtosis-1.217596069
Mean27088.2074
Median Absolute Deviation (MAD)13724
Skewness-0.01240652578
Sum270882074
Variance244929009.2
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
491521
 
< 0.1%
260991
 
< 0.1%
457631
 
< 0.1%
519081
 
< 0.1%
170931
 
< 0.1%
303271
 
< 0.1%
213721
 
< 0.1%
150501
 
< 0.1%
519161
 
< 0.1%
375831
 
< 0.1%
Other values (9990)9990
99.9%
ValueCountFrequency (%)
11
< 0.1%
21
< 0.1%
61
< 0.1%
101
< 0.1%
111
< 0.1%
171
< 0.1%
181
< 0.1%
281
< 0.1%
551
< 0.1%
591
< 0.1%
ValueCountFrequency (%)
539221
< 0.1%
539201
< 0.1%
539181
< 0.1%
539141
< 0.1%
539121
< 0.1%
539071
< 0.1%
539061
< 0.1%
539051
< 0.1%
539011
< 0.1%
538911
< 0.1%

carat
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct237
Distinct (%)2.4%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean0.791456
Minimum0.2
Maximum4.5
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum0.2
5-th percentile0.3
Q10.4
median0.7
Q31.04
95-th percentile1.71
Maximum4.5
Range4.3
Interquartile range (IQR)0.64

Descriptive statistics

Standard deviation0.4728613326
Coefficient of variation (CV)0.597457512
Kurtosis1.490228108
Mean0.791456
Median Absolute Deviation (MAD)0.32
Skewness1.162005658
Sum7914.56
Variance0.2235978398
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
0.3481
 
4.8%
0.31447
 
4.5%
1.01386
 
3.9%
0.7368
 
3.7%
0.32346
 
3.5%
1287
 
2.9%
0.9262
 
2.6%
0.4259
 
2.6%
0.71237
 
2.4%
0.5232
 
2.3%
Other values (227)6695
67.0%
ValueCountFrequency (%)
0.22
 
< 0.1%
0.214
 
< 0.1%
0.221
 
< 0.1%
0.2348
0.5%
0.2449
0.5%
0.2541
0.4%
0.2644
0.4%
0.2745
0.4%
0.2829
0.3%
0.2924
0.2%
ValueCountFrequency (%)
4.51
< 0.1%
4.011
< 0.1%
3.671
< 0.1%
3.021
< 0.1%
3.012
< 0.1%
31
< 0.1%
2.751
< 0.1%
2.741
< 0.1%
2.681
< 0.1%
2.651
< 0.1%

cut
Categorical

HIGH CORRELATION

Distinct5
Distinct (%)0.1%
Missing0
Missing (%)0.0%
Memory size78.2 KiB
Ideal
3931 
Premium
2603 
Very Good
2308 
Good
883 
Fair
 
275

Length

Max length9
Median length5
Mean length6.328
Min length4

Characters and Unicode

Total characters0
Distinct characters0
Distinct categories0 ?
Distinct scripts0 ?
Distinct blocks0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.

Unique

Unique0 ?
Unique (%)0.0%

Sample

1st rowIdeal
2nd rowVery Good
3rd rowIdeal
4th rowPremium
5th rowGood

Common Values

ValueCountFrequency (%)
Ideal3931
39.3%
Premium2603
26.0%
Very Good2308
23.1%
Good883
 
8.8%
Fair275
 
2.8%

Length

Histogram of lengths of the category

Pie chart

ValueCountFrequency (%)
ideal3931
31.9%
good3191
25.9%
premium2603
21.1%
very2308
18.8%
fair275
 
2.2%

Most occurring characters

ValueCountFrequency (%)
No values found.

Most occurring categories

ValueCountFrequency (%)
No values found.

Most frequent character per category

Most occurring scripts

ValueCountFrequency (%)
No values found.

Most frequent character per script

Most occurring blocks

ValueCountFrequency (%)
No values found.

Most frequent character per block

color
Categorical

Distinct7
Distinct (%)0.1%
Missing0
Missing (%)0.0%
Memory size78.2 KiB
G
2089 
E
1811 
F
1803 
H
1559 
D
1252 
Other values (2)
1486 

Length

Max length1
Median length1
Mean length1
Min length1

Characters and Unicode

Total characters0
Distinct characters0
Distinct categories0 ?
Distinct scripts0 ?
Distinct blocks0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.

Unique

Unique0 ?
Unique (%)0.0%

Sample

1st rowD
2nd rowE
3rd rowG
4th rowF
5th rowJ

Common Values

ValueCountFrequency (%)
G2089
20.9%
E1811
18.1%
F1803
18.0%
H1559
15.6%
D1252
12.5%
I1001
10.0%
J485
 
4.9%

Length

Histogram of lengths of the category

Pie chart

ValueCountFrequency (%)
g2089
20.9%
e1811
18.1%
f1803
18.0%
h1559
15.6%
d1252
12.5%
i1001
10.0%
j485
 
4.9%

Most occurring characters

ValueCountFrequency (%)
No values found.

Most occurring categories

ValueCountFrequency (%)
No values found.

Most frequent character per category

Most occurring scripts

ValueCountFrequency (%)
No values found.

Most frequent character per script

Most occurring blocks

ValueCountFrequency (%)
No values found.

Most frequent character per block

clarity
Categorical

Distinct8
Distinct (%)0.1%
Missing0
Missing (%)0.0%
Memory size78.2 KiB
SI1
2422 
VS2
2284 
SI2
1719 
VS1
1528 
VVS2
905 
Other values (3)
1142 

Length

Max length4
Median length3
Mean length3.1147
Min length2

Characters and Unicode

Total characters0
Distinct characters0
Distinct categories0 ?
Distinct scripts0 ?
Distinct blocks0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.

Unique

Unique0 ?
Unique (%)0.0%

Sample

1st rowVVS1
2nd rowVVS2
3rd rowVVS1
4th rowVS1
5th rowVS2

Common Values

ValueCountFrequency (%)
SI12422
24.2%
VS22284
22.8%
SI21719
17.2%
VS11528
15.3%
VVS2905
 
9.0%
VVS1692
 
6.9%
IF319
 
3.2%
I1131
 
1.3%

Length

Histogram of lengths of the category

Pie chart

ValueCountFrequency (%)
si12422
24.2%
vs22284
22.8%
si21719
17.2%
vs11528
15.3%
vvs2905
 
9.0%
vvs1692
 
6.9%
if319
 
3.2%
i1131
 
1.3%

Most occurring characters

ValueCountFrequency (%)
No values found.

Most occurring categories

ValueCountFrequency (%)
No values found.

Most frequent character per category

Most occurring scripts

ValueCountFrequency (%)
No values found.

Most frequent character per script

Most occurring blocks

ValueCountFrequency (%)
No values found.

Most frequent character per block

depth
Real number (ℝ≥0)

HIGH CORRELATION

Distinct139
Distinct (%)1.4%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean61.744
Minimum44
Maximum73.6
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum44
5-th percentile59.3
Q161.1
median61.8
Q362.5
95-th percentile63.705
Maximum73.6
Range29.6
Interquartile range (IQR)1.4

Descriptive statistics

Standard deviation1.416036199
Coefficient of variation (CV)0.02293398871
Kurtosis6.215816952
Mean61.744
Median Absolute Deviation (MAD)0.7
Skewness-0.2259760176
Sum617440
Variance2.005158516
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
62409
 
4.1%
61.9403
 
4.0%
62.2387
 
3.9%
61.6384
 
3.8%
62.1376
 
3.8%
61.8373
 
3.7%
61.7362
 
3.6%
62.3360
 
3.6%
62.4354
 
3.5%
61.5301
 
3.0%
Other values (129)6291
62.9%
ValueCountFrequency (%)
441
 
< 0.1%
52.31
 
< 0.1%
53.11
 
< 0.1%
55.21
 
< 0.1%
55.32
< 0.1%
55.52
< 0.1%
55.62
< 0.1%
55.82
< 0.1%
55.93
< 0.1%
563
< 0.1%
ValueCountFrequency (%)
73.61
 
< 0.1%
72.91
 
< 0.1%
71.21
 
< 0.1%
70.51
 
< 0.1%
70.21
 
< 0.1%
70.11
 
< 0.1%
69.51
 
< 0.1%
691
 
< 0.1%
68.93
< 0.1%
68.71
 
< 0.1%

table
Real number (ℝ≥0)

HIGH CORRELATION

Distinct90
Distinct (%)0.9%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean57.44649
Minimum43
Maximum71
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum43
5-th percentile54
Q156
median57
Q359
95-th percentile61
Maximum71
Range28
Interquartile range (IQR)3

Descriptive statistics

Standard deviation2.223800847
Coefficient of variation (CV)0.03871082197
Kurtosis1.249490598
Mean57.44649
Median Absolute Deviation (MAD)1
Skewness0.6529730234
Sum574464.9
Variance4.945290209
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
571817
18.2%
561756
17.6%
581556
15.6%
591254
12.5%
551213
12.1%
60785
7.8%
54491
 
4.9%
61404
 
4.0%
62232
 
2.3%
63111
 
1.1%
Other values (80)381
 
3.8%
ValueCountFrequency (%)
431
 
< 0.1%
491
 
< 0.1%
513
 
< 0.1%
5219
 
0.2%
52.81
 
< 0.1%
5394
0.9%
53.11
 
< 0.1%
53.41
 
< 0.1%
53.53
 
< 0.1%
53.63
 
< 0.1%
ValueCountFrequency (%)
711
 
< 0.1%
702
 
< 0.1%
693
 
< 0.1%
683
 
< 0.1%
678
 
0.1%
6615
 
0.1%
6524
 
0.2%
6449
0.5%
63111
1.1%
62.51
 
< 0.1%

price
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct5091
Distinct (%)50.9%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean3873.9908
Minimum326
Maximum18795
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum326
5-th percentile544
Q1950
median2361
Q35226
95-th percentile13016.15
Maximum18795
Range18469
Interquartile range (IQR)4276

Descriptive statistics

Standard deviation3965.051797
Coefficient of variation (CV)1.023505734
Kurtosis2.363425826
Mean3873.9908
Median Absolute Deviation (MAD)1631.5
Skewness1.663784166
Sum38739908
Variance15721635.75
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
54432
 
0.3%
62531
 
0.3%
60530
 
0.3%
77629
 
0.3%
69828
 
0.3%
82825
 
0.2%
68424
 
0.2%
72023
 
0.2%
78923
 
0.2%
55223
 
0.2%
Other values (5081)9732
97.3%
ValueCountFrequency (%)
3261
< 0.1%
3271
< 0.1%
3361
< 0.1%
3391
< 0.1%
3401
< 0.1%
3512
< 0.1%
3572
< 0.1%
3621
< 0.1%
3631
< 0.1%
3641
< 0.1%
ValueCountFrequency (%)
187951
< 0.1%
187841
< 0.1%
187811
< 0.1%
187791
< 0.1%
187661
< 0.1%
187451
< 0.1%
187411
< 0.1%
187061
< 0.1%
186781
< 0.1%
186111
< 0.1%

x
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct495
Distinct (%)5.0%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean5.716036
Minimum0
Maximum10.23
Zeros1
Zeros (%)< 0.1%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum0
5-th percentile4.29
Q14.71
median5.68
Q36.53
95-th percentile7.65
Maximum10.23
Range10.23
Interquartile range (IQR)1.82

Descriptive statistics

Standard deviation1.118243951
Coefficient of variation (CV)0.1956327691
Kurtosis-0.6066053655
Mean5.716036
Median Absolute Deviation (MAD)0.92
Skewness0.4124316947
Sum57160.36
Variance1.250469534
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
4.3492
 
0.9%
4.3592
 
0.9%
4.3984
 
0.8%
4.3681
 
0.8%
4.3375
 
0.8%
4.3274
 
0.7%
4.3173
 
0.7%
4.372
 
0.7%
4.4271
 
0.7%
4.3769
 
0.7%
Other values (485)9217
92.2%
ValueCountFrequency (%)
01
 
< 0.1%
3.812
 
< 0.1%
3.852
 
< 0.1%
3.861
 
< 0.1%
3.881
 
< 0.1%
3.893
< 0.1%
3.93
< 0.1%
3.915
0.1%
3.925
0.1%
3.936
0.1%
ValueCountFrequency (%)
10.231
< 0.1%
10.141
< 0.1%
9.861
< 0.1%
9.442
< 0.1%
9.31
< 0.1%
9.111
< 0.1%
9.041
< 0.1%
8.991
< 0.1%
8.91
< 0.1%
8.881
< 0.1%

y
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct500
Distinct (%)5.0%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean5.726097
Minimum3.77
Maximum58.9
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum3.77
5-th percentile4.3
Q14.72
median5.7
Q36.53
95-th percentile7.65
Maximum58.9
Range55.13
Interquartile range (IQR)1.81

Descriptive statistics

Standard deviation1.256195462
Coefficient of variation (CV)0.2193807514
Kurtosis338.2400271
Mean5.726097
Median Absolute Deviation (MAD)0.92
Skewness8.753843386
Sum57260.97
Variance1.578027039
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
4.3899
 
1.0%
4.3989
 
0.9%
4.3788
 
0.9%
4.3288
 
0.9%
4.3488
 
0.9%
4.3683
 
0.8%
4.3580
 
0.8%
4.3377
 
0.8%
4.4169
 
0.7%
4.3165
 
0.7%
Other values (490)9174
91.7%
ValueCountFrequency (%)
3.771
 
< 0.1%
3.781
 
< 0.1%
3.811
 
< 0.1%
3.841
 
< 0.1%
3.851
 
< 0.1%
3.861
 
< 0.1%
3.891
 
< 0.1%
3.95
0.1%
3.921
 
< 0.1%
3.933
< 0.1%
ValueCountFrequency (%)
58.91
< 0.1%
31.81
< 0.1%
10.161
< 0.1%
10.11
< 0.1%
9.811
< 0.1%
9.381
< 0.1%
9.371
< 0.1%
9.141
< 0.1%
9.021
< 0.1%
8.981
< 0.1%

z
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct328
Distinct (%)3.3%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean3.529078
Minimum0
Maximum8.06
Zeros4
Zeros (%)< 0.1%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum0
5-th percentile2.65
Q12.91
median3.51
Q34.03
95-th percentile4.7405
Maximum8.06
Range8.06
Interquartile range (IQR)1.12

Descriptive statistics

Standard deviation0.6945528852
Coefficient of variation (CV)0.1968085957
Kurtosis-0.2749503117
Mean3.529078
Median Absolute Deviation (MAD)0.56
Skewness0.3966858454
Sum35290.78
Variance0.4824037103
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
2.69166
 
1.7%
2.7151
 
1.5%
2.71139
 
1.4%
2.72126
 
1.3%
2.68125
 
1.2%
2.67119
 
1.2%
2.73115
 
1.1%
2.66103
 
1.0%
3.5599
 
1.0%
4.0198
 
1.0%
Other values (318)8759
87.6%
ValueCountFrequency (%)
04
< 0.1%
2.291
 
< 0.1%
2.31
 
< 0.1%
2.312
< 0.1%
2.322
< 0.1%
2.331
 
< 0.1%
2.351
 
< 0.1%
2.361
 
< 0.1%
2.372
< 0.1%
2.382
< 0.1%
ValueCountFrequency (%)
8.061
 
< 0.1%
6.721
 
< 0.1%
6.171
 
< 0.1%
6.131
 
< 0.1%
5.911
 
< 0.1%
5.621
 
< 0.1%
5.611
 
< 0.1%
5.61
 
< 0.1%
5.583
< 0.1%
5.571
 
< 0.1%

Interactions

Correlations

Spearman's ρ

The Spearman's rank correlation coefficient (ρ) is a measure of monotonic correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than Pearson's r. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.

To calculate ρ for two variables X and Y, one divides the covariance of the rank variables of X and Y by the product of their standard deviations.

Pearson's r

The Pearson's correlation coefficient (r) is a measure of linear correlation between two variables. It's value lies between -1 and +1, -1 indicating total negative linear correlation, 0 indicating no linear correlation and 1 indicating total positive linear correlation. Furthermore, r is invariant under separate changes in location and scale of the two variables, implying that for a linear function the angle to the x-axis does not affect r.

To calculate r for two variables X and Y, one divides the covariance of X and Y by the product of their standard deviations.

Kendall's τ

Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation coefficient (τ) measures ordinal association between two variables. It's value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation.

To calculate τ for two variables X and Y, one determines the number of concordant and discordant pairs of observations. τ is given by the number of concordant pairs minus the discordant pairs divided by the total number of pairs.

Cramér's V (φc)

Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association. The empirical estimators used for Cramér's V have been proved to be biased, even for large samples. We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found here.

Phik (φk)

Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case of a bivariate normal input distribution. There is extensive documentation available here.

Missing values

A simple visualization of nullity by column.
Nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.

Sample

First rows

df_indexcaratcutcolorclaritydepthtablepricexyz
077530.64IdealDVVS161.556.042815.575.593.43
1503940.52Very GoodEVVS261.258.022545.175.193.17
2389890.36IdealGVVS162.857.010534.554.522.85
3231621.32PremiumFVS161.759.0111776.996.954.30
4314450.40GoodJVS264.057.07654.704.673.00
5132071.21Very GoodISI160.656.054576.896.974.20
6358470.31IdealFVVS261.655.09174.354.382.69
7229871.56Very GoodHVS263.160.0110397.437.344.66
8460190.53GoodEVS263.755.017275.155.123.27
9514970.70FairFVS165.859.023815.585.483.64

Last rows

df_indexcaratcutcolorclaritydepthtablepricexyz
9990313570.27IdealFIF63.055.07604.134.162.61
9991481690.79GoodFSI264.360.019435.825.753.72
9992246292.01GoodISI259.159.0129648.098.144.80
9993381450.30Very GoodGVVS163.557.010134.274.242.70
9994124651.03IdealDSI162.356.052496.516.454.04
9995153720.27Very GoodEVVS261.260.06064.164.172.55
9996188101.06GoodFVS158.557.076996.686.733.92
999765641.00GoodESI261.564.040776.296.223.85
999828640.72IdealDVS260.557.032755.815.833.52
999986881.07IdealHVS262.157.044586.536.494.04
","textData":"
\rSummarize dataset: 0%| | 0/5 [00:00<?, ?it/s]\rSummarize dataset: 0%| | 0/16 [00:00<?, ?it/s, Describe variable:table]\rSummarize dataset: 6%|▋ | 1/16 [00:00<00:05, 2.89it/s, Describe variable:table]\rSummarize dataset: 6%|▋ | 1/16 [00:00<00:05, 2.89it/s, Describe variable:depth]\rSummarize dataset: 12%|█▎ | 2/16 [00:00<00:02, 4.88it/s, Describe variable:depth]\rSummarize dataset: 12%|█▎ | 2/16 [00:00<00:02, 4.88it/s, Describe variable:y] \rSummarize dataset: 19%|█▉ | 3/16 [00:00<00:02, 4.88it/s, Describe variable:x]\rSummarize dataset: 25%|██▌ | 4/16 [00:00<00:02, 4.88it/s, Describe variable:carat]\rSummarize dataset: 31%|███▏ | 5/16 [00:00<00:02, 4.88it/s, Describe variable:z] \rSummarize dataset: 38%|███▊ | 6/16 [00:00<00:02, 4.88it/s, Describe variable:df_index]\rSummarize dataset: 44%|████▍ | 7/16 [00:00<00:01, 4.88it/s, Describe variable:price] \rSummarize dataset: 50%|█████ | 8/16 [00:00<00:01, 4.88it/s, Describe variable:color]\rSummarize dataset: 56%|█████▋ | 9/16 [00:00<00:00, 19.55it/s, Describe variable:color]\rSummarize dataset: 56%|█████▋ | 9/16 [00:00<00:00, 19.55it/s, Describe variable:cut] \rSummarize dataset: 62%|██████▎ | 10/16 [00:00<00:00, 19.55it/s, Describe variable:clarity]\rSummarize dataset: 69%|██████▉ | 11/16 [00:00<00:00, 19.55it/s, Get variable types] \rSummarize dataset: 55%|█████▍ | 12/22 [00:00<00:00, 19.55it/s, Calculate spearman correlation]\rSummarize dataset: 59%|█████▉ | 13/22 [00:00<00:00, 19.55it/s, Calculate pearson correlation] \rSummarize dataset: 64%|██████▎ | 14/22 [00:00<00:00, 19.55it/s, Calculate kendall correlation]\rSummarize dataset: 68%|██████▊ | 15/22 [00:00<00:00, 29.27it/s, Calculate kendall correlation]\rSummarize dataset: 68%|██████▊ | 15/22 [00:00<00:00, 29.27it/s, Calculate cramers correlation]\rSummarize dataset: 73%|███████▎ | 16/22 [00:00<00:00, 29.27it/s, Calculate phi_k correlation] \rSummarize dataset: 77%|███████▋ | 17/22 [00:02<00:00, 29.27it/s, Get scatter matrix] \rSummarize dataset: 20%|█▉ | 17/86 [00:02<00:02, 29.27it/s, scatter df_index, df_index]\rSummarize dataset: 21%|██ | 18/86 [00:02<00:02, 29.27it/s, scatter carat, df_index] \rSummarize dataset: 22%|██▏ | 19/86 [00:03<00:14, 4.77it/s, scatter carat, df_index]\rSummarize dataset: 22%|██▏ | 19/86 [00:03<00:14, 4.77it/s, scatter depth, df_index]\rSummarize dataset: 23%|██▎ | 20/86 [00:03<00:13, 4.77it/s, scatter table, df_index]\rSummarize dataset: 24%|██▍ | 21/86 [00:03<00:13, 4.77it/s, scatter price, df_index]\rSummarize dataset: 26%|██▌ | 22/86 [00:03<00:13, 4.87it/s, scatter price, df_index]\rSummarize dataset: 26%|██▌ | 22/86 [00:03<00:13, 4.87it/s, scatter x, df_index] \rSummarize dataset: 27%|██▋ | 23/86 [00:03<00:12, 4.87it/s, scatter y, df_index]\rSummarize dataset: 28%|██▊ | 24/86 [00:03<00:12, 4.93it/s, scatter y, df_index]\rSummarize dataset: 28%|██▊ | 24/86 [00:03<00:12, 4.93it/s, scatter z, df_index]\rSummarize dataset: 29%|██▉ | 25/86 [00:04<00:12, 4.93it/s, scatter df_index, carat]\rSummarize dataset: 30%|███ | 26/86 [00:04<00:12, 4.96it/s, scatter df_index, carat]\rSummarize dataset: 30%|███ | 26/86 [00:04<00:12, 4.96it/s, scatter carat, carat] \rSummarize dataset: 31%|███▏ | 27/86 [00:04<00:11, 4.96it/s, scatter depth, carat]\rSummarize dataset: 33%|███▎ | 28/86 [00:04<00:11, 5.01it/s, scatter depth, carat]\rSummarize dataset: 33%|███▎ | 28/86 [00:04<00:11, 5.01it/s, scatter table, carat]\rSummarize dataset: 34%|███▎ | 29/86 [00:04<00:11, 5.05it/s, scatter table, carat]\rSummarize dataset: 34%|███▎ | 29/86 [00:04<00:11, 5.05it/s, scatter price, carat]\rSummarize dataset: 35%|███▍ | 30/86 [00:05<00:11, 5.05it/s, scatter price, carat]\rSummarize dataset: 35%|███▍ | 30/86 [00:05<00:11, 5.05it/s, scatter x, carat] \rSummarize dataset: 36%|███▌ | 31/86 [00:05<00:10, 5.08it/s, scatter x, carat]\rSummarize dataset: 36%|███▌ | 31/86 [00:05<00:10, 5.08it/s, scatter y, carat]\rSummarize dataset: 37%|███▋ | 32/86 [00:05<00:10, 5.14it/s, scatter y, carat]\rSummarize dataset: 37%|███▋ | 32/86 [00:05<00:10, 5.14it/s, scatter z, carat]\rSummarize dataset: 38%|███▊ | 33/86 [00:05<00:10, 5.11it/s, scatter z, carat]\rSummarize dataset: 38%|███▊ | 33/86 [00:05<00:10, 5.11it/s, scatter df_index, depth]\rSummarize dataset: 40%|███▉ | 34/86 [00:05<00:10, 5.12it/s, scatter df_index, depth]\rSummarize dataset: 40%|███▉ | 34/86 [00:05<00:10, 5.12it/s, scatter carat, depth] \rSummarize dataset: 41%|████ | 35/86 [00:06<00:09, 5.14it/s, scatter carat, depth]\rSummarize dataset: 41%|████ | 35/86 [00:06<00:09, 5.14it/s, scatter depth, depth]\rSummarize dataset: 42%|████▏ | 36/86 [00:06<00:09, 5.10it/s, scatter depth, depth]\rSummarize dataset: 42%|████▏ | 36/86 [00:06<00:09, 5.10it/s, scatter table, depth]\rSummarize dataset: 43%|████▎ | 37/86 [00:06<00:09, 5.13it/s, scatter table, depth]\rSummarize dataset: 43%|████▎ | 37/86 [00:06<00:09, 5.13it/s, scatter price, depth]\rSummarize dataset: 44%|████▍ | 38/86 [00:06<00:09, 5.04it/s, scatter price, depth]\rSummarize dataset: 44%|████▍ | 38/86 [00:06<00:09, 5.04it/s, scatter x, depth] \rSummarize dataset: 45%|████▌ | 39/86 [00:06<00:09, 5.11it/s, scatter x, depth]\rSummarize dataset: 45%|████▌ | 39/86 [00:06<00:09, 5.11it/s, scatter y, depth]\rSummarize dataset: 47%|████▋ | 40/86 [00:07<00:08, 5.13it/s, scatter y, depth]\rSummarize dataset: 47%|████▋ | 40/86 [00:07<00:08, 5.13it/s, scatter z, depth]\rSummarize dataset: 48%|████▊ | 41/86 [00:07<00:08, 5.09it/s, scatter z, depth]\rSummarize dataset: 48%|████▊ | 41/86 [00:07<00:08, 5.09it/s, scatter df_index, table]\rSummarize dataset: 49%|████▉ | 42/86 [00:07<00:08, 5.14it/s, scatter df_index, table]\rSummarize dataset: 49%|████▉ | 42/86 [00:07<00:08, 5.14it/s, scatter carat, table] \rSummarize dataset: 50%|█████ | 43/86 [00:07<00:08, 5.22it/s, scatter carat, table]\rSummarize dataset: 50%|█████ | 43/86 [00:07<00:08, 5.22it/s, scatter depth, table]\rSummarize dataset: 51%|█████ | 44/86 [00:07<00:08, 5.15it/s, scatter depth, table]\rSummarize dataset: 51%|█████ | 44/86 [00:07<00:08, 5.15it/s, scatter table, table]\rSummarize dataset: 52%|█████▏ | 45/86 [00:08<00:07, 5.18it/s, scatter table, table]\rSummarize dataset: 52%|█████▏ | 45/86 [00:08<00:07, 5.18it/s, scatter price, table]\rSummarize dataset: 53%|█████▎ | 46/86 [00:08<00:07, 5.04it/s, scatter price, table]\rSummarize dataset: 53%|█████▎ | 46/86 [00:08<00:07, 5.04it/s, scatter x, table] \rSummarize dataset: 55%|█████▍ | 47/86 [00:08<00:07, 5.01it/s, scatter x, table]\rSummarize dataset: 55%|█████▍ | 47/86 [00:08<00:07, 5.01it/s, scatter y, table]\rSummarize dataset: 56%|█████▌ | 48/86 [00:08<00:09, 4.20it/s, scatter y, table]\rSummarize dataset: 56%|█████▌ | 48/86 [00:08<00:09, 4.20it/s, scatter z, table]\rSummarize dataset: 57%|█████▋ | 49/86 [00:08<00:08, 4.45it/s, scatter z, table]\rSummarize dataset: 57%|█████▋ | 49/86 [00:08<00:08, 4.45it/s, scatter df_index, price]\rSummarize dataset: 58%|█████▊ | 50/86 [00:09<00:07, 4.52it/s, scatter df_index, price]\rSummarize dataset: 58%|█████▊ | 50/86 [00:09<00:07, 4.52it/s, scatter carat, price] \rSummarize dataset: 59%|█████▉ | 51/86 [00:09<00:07, 4.68it/s, scatter carat, price]\rSummarize dataset: 59%|█████▉ | 51/86 [00:09<00:07, 4.68it/s, scatter depth, price]\rSummarize dataset: 60%|██████ | 52/86 [00:09<00:07, 4.70it/s, scatter depth, price]\rSummarize dataset: 60%|██████ | 52/86 [00:09<00:07, 4.70it/s, scatter table, price]\rSummarize dataset: 62%|██████▏ | 53/86 [00:09<00:06, 4.78it/s, scatter table, price]\rSummarize dataset: 62%|██████▏ | 53/86 [00:09<00:06, 4.78it/s, scatter price, price]\rSummarize dataset: 63%|██████▎ | 54/86 [00:10<00:06, 4.79it/s, scatter price, price]\rSummarize dataset: 63%|██████▎ | 54/86 [00:10<00:06, 4.79it/s, scatter x, price] \rSummarize dataset: 64%|██████▍ | 55/86 [00:10<00:06, 4.82it/s, scatter x, price]\rSummarize dataset: 64%|██████▍ | 55/86 [00:10<00:06, 4.82it/s, scatter y, price]\rSummarize dataset: 65%|██████▌ | 56/86 [00:10<00:06, 4.88it/s, scatter y, price]\rSummarize dataset: 65%|██████▌ | 56/86 [00:10<00:06, 4.88it/s, scatter z, price]\rSummarize dataset: 66%|██████▋ | 57/86 [00:10<00:06, 4.80it/s, scatter z, price]\rSummarize dataset: 66%|██████▋ | 57/86 [00:10<00:06, 4.80it/s, scatter df_index, x]\rSummarize dataset: 67%|██████▋ | 58/86 [00:10<00:05, 4.98it/s, scatter df_index, x]\rSummarize dataset: 67%|██████▋ | 58/86 [00:10<00:05, 4.98it/s, scatter carat, x] \rSummarize dataset: 69%|██████▊ | 59/86 [00:10<00:05, 5.17it/s, scatter carat, x]\rSummarize dataset: 69%|██████▊ | 59/86 [00:10<00:05, 5.17it/s, scatter depth, x]\rSummarize dataset: 70%|██████▉ | 60/86 [00:11<00:05, 5.08it/s, scatter depth, x]\rSummarize dataset: 70%|██████▉ | 60/86 [00:11<00:05, 5.08it/s, scatter table, x]\rSummarize dataset: 71%|███████ | 61/86 [00:11<00:04, 5.23it/s, scatter table, x]\rSummarize dataset: 71%|███████ | 61/86 [00:11<00:04, 5.23it/s, scatter price, x]\rSummarize dataset: 72%|███████▏ | 62/86 [00:11<00:04, 5.27it/s, scatter price, x]\rSummarize dataset: 72%|███████▏ | 62/86 [00:11<00:04, 5.27it/s, scatter x, x] \rSummarize dataset: 73%|███████▎ | 63/86 [00:11<00:04, 5.35it/s, scatter x, x]\rSummarize dataset: 73%|███████▎ | 63/86 [00:11<00:04, 5.35it/s, scatter y, x]\rSummarize dataset: 74%|███████▍ | 64/86 [00:11<00:04, 5.41it/s, scatter y, x]\rSummarize dataset: 74%|███████▍ | 64/86 [00:11<00:04, 5.41it/s, scatter z, x]\rSummarize dataset: 76%|███████▌ | 65/86 [00:12<00:03, 5.39it/s, scatter z, x]\rSummarize dataset: 76%|███████▌ | 65/86 [00:12<00:03, 5.39it/s, scatter df_index, y]\rSummarize dataset: 77%|███████▋ | 66/86 [00:12<00:03, 5.48it/s, scatter df_index, y]\rSummarize dataset: 77%|███████▋ | 66/86 [00:12<00:03, 5.48it/s, scatter carat, y] \rSummarize dataset: 78%|███████▊ | 67/86 [00:12<00:03, 5.57it/s, scatter carat, y]\rSummarize dataset: 78%|███████▊ | 67/86 [00:12<00:03, 5.57it/s, scatter depth, y]\rSummarize dataset: 79%|███████▉ | 68/86 [00:12<00:03, 5.50it/s, scatter depth, y]\rSummarize dataset: 79%|███████▉ | 68/86 [00:12<00:03, 5.50it/s, scatter table, y]\rSummarize dataset: 80%|████████ | 69/86 [00:12<00:03, 5.55it/s, scatter table, y]\rSummarize dataset: 80%|████████ | 69/86 [00:12<00:03, 5.55it/s, scatter price, y]\rSummarize dataset: 81%|████████▏ | 70/86 [00:13<00:02, 5.43it/s, scatter price, y]\rSummarize dataset: 81%|████████▏ | 70/86 [00:13<00:02, 5.43it/s, scatter x, y] \rSummarize dataset: 83%|████████▎ | 71/86 [00:13<00:02, 5.38it/s, scatter x, y]\rSummarize dataset: 83%|████████▎ | 71/86 [00:13<00:02, 5.38it/s, scatter y, y]\rSummarize dataset: 84%|████████▎ | 72/86 [00:13<00:02, 5.40it/s, scatter y, y]\rSummarize dataset: 84%|████████▎ | 72/86 [00:13<00:02, 5.40it/s, scatter z, y]\rSummarize dataset: 85%|████████▍ | 73/86 [00:13<00:02, 5.39it/s, scatter z, y]\rSummarize dataset: 85%|████████▍ | 73/86 [00:13<00:02, 5.39it/s, scatter df_index, z]\rSummarize dataset: 86%|████████▌ | 74/86 [00:13<00:02, 5.36it/s, scatter df_index, z]\rSummarize dataset: 86%|████████▌ | 74/86 [00:13<00:02, 5.36it/s, scatter carat, z] \rSummarize dataset: 87%|████████▋ | 75/86 [00:13<00:02, 5.40it/s, scatter carat, z]\rSummarize dataset: 87%|████████▋ | 75/86 [00:13<00:02, 5.40it/s, scatter depth, z]\rSummarize dataset: 88%|████████▊ | 76/86 [00:14<00:01, 5.35it/s, scatter depth, z]\rSummarize dataset: 88%|████████▊ | 76/86 [00:14<00:01, 5.35it/s, scatter table, z]\rSummarize dataset: 90%|████████▉ | 77/86 [00:14<00:02, 4.42it/s, scatter table, z]\rSummarize dataset: 90%|████████▉ | 77/86 [00:14<00:02, 4.42it/s, scatter price, z]\rSummarize dataset: 91%|█████████ | 78/86 [00:14<00:01, 4.59it/s, scatter price, z]\rSummarize dataset: 91%|█████████ | 78/86 [00:14<00:01, 4.59it/s, scatter x, z] \rSummarize dataset: 92%|█████████▏| 79/86 [00:14<00:01, 4.82it/s, scatter x, z]\rSummarize dataset: 92%|█████████▏| 79/86 [00:14<00:01, 4.82it/s, scatter y, z]\rSummarize dataset: 93%|█████████▎| 80/86 [00:15<00:01, 4.98it/s, scatter y, z]\rSummarize dataset: 93%|█████████▎| 80/86 [00:15<00:01, 4.98it/s, scatter z, z]\rSummarize dataset: 94%|█████████▍| 81/86 [00:15<00:00, 5.01it/s, scatter z, z]\rSummarize dataset: 94%|█████████▍| 81/86 [00:15<00:00, 5.01it/s, Get dataframe statistics]\rSummarize dataset: 93%|█████████▎| 82/88 [00:15<00:01, 5.01it/s, Missing diagram bar] \rSummarize dataset: 94%|█████████▍| 83/88 [00:15<00:00, 5.25it/s, Missing diagram bar]\rSummarize dataset: 94%|█████████▍| 83/88 [00:15<00:00, 5.25it/s, Missing diagram matrix]\rSummarize dataset: 95%|█████████▌| 84/88 [00:15<00:00, 5.76it/s, Missing diagram matrix]\rSummarize dataset: 95%|█████████▌| 84/88 [00:15<00:00, 5.76it/s, Take sample] \rSummarize dataset: 97%|█████████▋| 85/88 [00:15<00:00, 5.76it/s, Detecting duplicates]\rSummarize dataset: 98%|█████████▊| 86/88 [00:15<00:00, 5.76it/s, Get alerts] \rSummarize dataset: 99%|█████████▉| 87/88 [00:15<00:00, 5.76it/s, Get reproduction details]\rSummarize dataset: 100%|██████████| 88/88 [00:15<00:00, 5.76it/s, Completed] \rSummarize dataset: 100%|██████████| 88/88 [00:15<00:00, 5.60it/s, Completed]\n\rGenerate report structure: 0%| | 0/1 [00:00<?, ?it/s]\rGenerate report structure: 100%|██████████| 1/1 [00:05<00:00, 5.49s/it]\rGenerate report structure: 100%|██████████| 1/1 [00:05<00:00, 5.49s/it]\n\rRender HTML: 0%| | 0/1 [00:00<?, ?it/s]\rRender HTML: 100%|██████████| 1/1 [00:02<00:00, 2.41s/it]\rRender HTML: 100%|██████████| 1/1 [00:02<00:00, 2.41s/it]\n\rExport report to file: 0%| | 0/1 [00:00<?, ?it/s]\rExport report to file: 100%|██████████| 1/1 [00:00<00:00, 94.40it/s]\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["Pandas Profiling Report

Overview

Dataset statistics

Number of variables11
Number of observations10000
Missing cells0
Missing cells (%)0.0%
Duplicate rows0
Duplicate rows (%)0.0%
Total size in memory859.5 KiB
Average record size in memory88.0 B

Variable types

Numeric8
Categorical3

Alerts

carat is highly correlated with price and 3 other fieldsHigh correlation
price is highly correlated with carat and 3 other fieldsHigh correlation
x is highly correlated with carat and 3 other fieldsHigh correlation
y is highly correlated with carat and 3 other fieldsHigh correlation
z is highly correlated with carat and 3 other fieldsHigh correlation
carat is highly correlated with price and 3 other fieldsHigh correlation
price is highly correlated with carat and 3 other fieldsHigh correlation
x is highly correlated with carat and 3 other fieldsHigh correlation
y is highly correlated with carat and 3 other fieldsHigh correlation
z is highly correlated with carat and 3 other fieldsHigh correlation
carat is highly correlated with price and 3 other fieldsHigh correlation
price is highly correlated with carat and 3 other fieldsHigh correlation
x is highly correlated with carat and 3 other fieldsHigh correlation
y is highly correlated with carat and 3 other fieldsHigh correlation
z is highly correlated with carat and 3 other fieldsHigh correlation
df_index is highly correlated with carat and 3 other fieldsHigh correlation
carat is highly correlated with df_index and 4 other fieldsHigh correlation
cut is highly correlated with depth and 1 other fieldsHigh correlation
depth is highly correlated with cutHigh correlation
table is highly correlated with cutHigh correlation
price is highly correlated with df_index and 3 other fieldsHigh correlation
x is highly correlated with df_index and 4 other fieldsHigh correlation
y is highly correlated with carat and 2 other fieldsHigh correlation
z is highly correlated with df_index and 4 other fieldsHigh correlation
df_index has unique values Unique

Reproduction

Analysis started2021-12-09 22:36:56.032662
Analysis finished2021-12-09 22:37:11.760218
Duration15.73 seconds
Software versionpandas-profiling v3.1.0
Download configurationconfig.json

Variables

df_index
Real number (ℝ≥0)

HIGH CORRELATION
UNIQUE

Distinct10000
Distinct (%)100.0%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean27088.2074
Minimum1
Maximum53922
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum1
5-th percentile2652.65
Q113355.75
median27192
Q340817
95-th percentile51315.1
Maximum53922
Range53921
Interquartile range (IQR)27461.25

Descriptive statistics

Standard deviation15650.20796
Coefficient of variation (CV)0.5777498573
Kurtosis-1.217596069
Mean27088.2074
Median Absolute Deviation (MAD)13724
Skewness-0.01240652578
Sum270882074
Variance244929009.2
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
491521
 
< 0.1%
260991
 
< 0.1%
457631
 
< 0.1%
519081
 
< 0.1%
170931
 
< 0.1%
303271
 
< 0.1%
213721
 
< 0.1%
150501
 
< 0.1%
519161
 
< 0.1%
375831
 
< 0.1%
Other values (9990)9990
99.9%
ValueCountFrequency (%)
11
< 0.1%
21
< 0.1%
61
< 0.1%
101
< 0.1%
111
< 0.1%
171
< 0.1%
181
< 0.1%
281
< 0.1%
551
< 0.1%
591
< 0.1%
ValueCountFrequency (%)
539221
< 0.1%
539201
< 0.1%
539181
< 0.1%
539141
< 0.1%
539121
< 0.1%
539071
< 0.1%
539061
< 0.1%
539051
< 0.1%
539011
< 0.1%
538911
< 0.1%

carat
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct237
Distinct (%)2.4%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean0.791456
Minimum0.2
Maximum4.5
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum0.2
5-th percentile0.3
Q10.4
median0.7
Q31.04
95-th percentile1.71
Maximum4.5
Range4.3
Interquartile range (IQR)0.64

Descriptive statistics

Standard deviation0.4728613326
Coefficient of variation (CV)0.597457512
Kurtosis1.490228108
Mean0.791456
Median Absolute Deviation (MAD)0.32
Skewness1.162005658
Sum7914.56
Variance0.2235978398
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
0.3481
 
4.8%
0.31447
 
4.5%
1.01386
 
3.9%
0.7368
 
3.7%
0.32346
 
3.5%
1287
 
2.9%
0.9262
 
2.6%
0.4259
 
2.6%
0.71237
 
2.4%
0.5232
 
2.3%
Other values (227)6695
67.0%
ValueCountFrequency (%)
0.22
 
< 0.1%
0.214
 
< 0.1%
0.221
 
< 0.1%
0.2348
0.5%
0.2449
0.5%
0.2541
0.4%
0.2644
0.4%
0.2745
0.4%
0.2829
0.3%
0.2924
0.2%
ValueCountFrequency (%)
4.51
< 0.1%
4.011
< 0.1%
3.671
< 0.1%
3.021
< 0.1%
3.012
< 0.1%
31
< 0.1%
2.751
< 0.1%
2.741
< 0.1%
2.681
< 0.1%
2.651
< 0.1%

cut
Categorical

HIGH CORRELATION

Distinct5
Distinct (%)0.1%
Missing0
Missing (%)0.0%
Memory size78.2 KiB
Ideal
3931 
Premium
2603 
Very Good
2308 
Good
883 
Fair
 
275

Length

Max length9
Median length5
Mean length6.328
Min length4

Characters and Unicode

Total characters0
Distinct characters0
Distinct categories0 ?
Distinct scripts0 ?
Distinct blocks0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.

Unique

Unique0 ?
Unique (%)0.0%

Sample

1st rowIdeal
2nd rowVery Good
3rd rowIdeal
4th rowPremium
5th rowGood

Common Values

ValueCountFrequency (%)
Ideal3931
39.3%
Premium2603
26.0%
Very Good2308
23.1%
Good883
 
8.8%
Fair275
 
2.8%

Length

Histogram of lengths of the category

Pie chart

ValueCountFrequency (%)
ideal3931
31.9%
good3191
25.9%
premium2603
21.1%
very2308
18.8%
fair275
 
2.2%

Most occurring characters

ValueCountFrequency (%)
No values found.

Most occurring categories

ValueCountFrequency (%)
No values found.

Most frequent character per category

Most occurring scripts

ValueCountFrequency (%)
No values found.

Most frequent character per script

Most occurring blocks

ValueCountFrequency (%)
No values found.

Most frequent character per block

color
Categorical

Distinct7
Distinct (%)0.1%
Missing0
Missing (%)0.0%
Memory size78.2 KiB
G
2089 
E
1811 
F
1803 
H
1559 
D
1252 
Other values (2)
1486 

Length

Max length1
Median length1
Mean length1
Min length1

Characters and Unicode

Total characters0
Distinct characters0
Distinct categories0 ?
Distinct scripts0 ?
Distinct blocks0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.

Unique

Unique0 ?
Unique (%)0.0%

Sample

1st rowD
2nd rowE
3rd rowG
4th rowF
5th rowJ

Common Values

ValueCountFrequency (%)
G2089
20.9%
E1811
18.1%
F1803
18.0%
H1559
15.6%
D1252
12.5%
I1001
10.0%
J485
 
4.9%

Length

Histogram of lengths of the category

Pie chart

ValueCountFrequency (%)
g2089
20.9%
e1811
18.1%
f1803
18.0%
h1559
15.6%
d1252
12.5%
i1001
10.0%
j485
 
4.9%

Most occurring characters

ValueCountFrequency (%)
No values found.

Most occurring categories

ValueCountFrequency (%)
No values found.

Most frequent character per category

Most occurring scripts

ValueCountFrequency (%)
No values found.

Most frequent character per script

Most occurring blocks

ValueCountFrequency (%)
No values found.

Most frequent character per block

clarity
Categorical

Distinct8
Distinct (%)0.1%
Missing0
Missing (%)0.0%
Memory size78.2 KiB
SI1
2422 
VS2
2284 
SI2
1719 
VS1
1528 
VVS2
905 
Other values (3)
1142 

Length

Max length4
Median length3
Mean length3.1147
Min length2

Characters and Unicode

Total characters0
Distinct characters0
Distinct categories0 ?
Distinct scripts0 ?
Distinct blocks0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.

Unique

Unique0 ?
Unique (%)0.0%

Sample

1st rowVVS1
2nd rowVVS2
3rd rowVVS1
4th rowVS1
5th rowVS2

Common Values

ValueCountFrequency (%)
SI12422
24.2%
VS22284
22.8%
SI21719
17.2%
VS11528
15.3%
VVS2905
 
9.0%
VVS1692
 
6.9%
IF319
 
3.2%
I1131
 
1.3%

Length

Histogram of lengths of the category

Pie chart

ValueCountFrequency (%)
si12422
24.2%
vs22284
22.8%
si21719
17.2%
vs11528
15.3%
vvs2905
 
9.0%
vvs1692
 
6.9%
if319
 
3.2%
i1131
 
1.3%

Most occurring characters

ValueCountFrequency (%)
No values found.

Most occurring categories

ValueCountFrequency (%)
No values found.

Most frequent character per category

Most occurring scripts

ValueCountFrequency (%)
No values found.

Most frequent character per script

Most occurring blocks

ValueCountFrequency (%)
No values found.

Most frequent character per block

depth
Real number (ℝ≥0)

HIGH CORRELATION

Distinct139
Distinct (%)1.4%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean61.744
Minimum44
Maximum73.6
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum44
5-th percentile59.3
Q161.1
median61.8
Q362.5
95-th percentile63.705
Maximum73.6
Range29.6
Interquartile range (IQR)1.4

Descriptive statistics

Standard deviation1.416036199
Coefficient of variation (CV)0.02293398871
Kurtosis6.215816952
Mean61.744
Median Absolute Deviation (MAD)0.7
Skewness-0.2259760176
Sum617440
Variance2.005158516
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
62409
 
4.1%
61.9403
 
4.0%
62.2387
 
3.9%
61.6384
 
3.8%
62.1376
 
3.8%
61.8373
 
3.7%
61.7362
 
3.6%
62.3360
 
3.6%
62.4354
 
3.5%
61.5301
 
3.0%
Other values (129)6291
62.9%
ValueCountFrequency (%)
441
 
< 0.1%
52.31
 
< 0.1%
53.11
 
< 0.1%
55.21
 
< 0.1%
55.32
< 0.1%
55.52
< 0.1%
55.62
< 0.1%
55.82
< 0.1%
55.93
< 0.1%
563
< 0.1%
ValueCountFrequency (%)
73.61
 
< 0.1%
72.91
 
< 0.1%
71.21
 
< 0.1%
70.51
 
< 0.1%
70.21
 
< 0.1%
70.11
 
< 0.1%
69.51
 
< 0.1%
691
 
< 0.1%
68.93
< 0.1%
68.71
 
< 0.1%

table
Real number (ℝ≥0)

HIGH CORRELATION

Distinct90
Distinct (%)0.9%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean57.44649
Minimum43
Maximum71
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum43
5-th percentile54
Q156
median57
Q359
95-th percentile61
Maximum71
Range28
Interquartile range (IQR)3

Descriptive statistics

Standard deviation2.223800847
Coefficient of variation (CV)0.03871082197
Kurtosis1.249490598
Mean57.44649
Median Absolute Deviation (MAD)1
Skewness0.6529730234
Sum574464.9
Variance4.945290209
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
571817
18.2%
561756
17.6%
581556
15.6%
591254
12.5%
551213
12.1%
60785
7.8%
54491
 
4.9%
61404
 
4.0%
62232
 
2.3%
63111
 
1.1%
Other values (80)381
 
3.8%
ValueCountFrequency (%)
431
 
< 0.1%
491
 
< 0.1%
513
 
< 0.1%
5219
 
0.2%
52.81
 
< 0.1%
5394
0.9%
53.11
 
< 0.1%
53.41
 
< 0.1%
53.53
 
< 0.1%
53.63
 
< 0.1%
ValueCountFrequency (%)
711
 
< 0.1%
702
 
< 0.1%
693
 
< 0.1%
683
 
< 0.1%
678
 
0.1%
6615
 
0.1%
6524
 
0.2%
6449
0.5%
63111
1.1%
62.51
 
< 0.1%

price
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct5091
Distinct (%)50.9%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean3873.9908
Minimum326
Maximum18795
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum326
5-th percentile544
Q1950
median2361
Q35226
95-th percentile13016.15
Maximum18795
Range18469
Interquartile range (IQR)4276

Descriptive statistics

Standard deviation3965.051797
Coefficient of variation (CV)1.023505734
Kurtosis2.363425826
Mean3873.9908
Median Absolute Deviation (MAD)1631.5
Skewness1.663784166
Sum38739908
Variance15721635.75
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
54432
 
0.3%
62531
 
0.3%
60530
 
0.3%
77629
 
0.3%
69828
 
0.3%
82825
 
0.2%
68424
 
0.2%
72023
 
0.2%
78923
 
0.2%
55223
 
0.2%
Other values (5081)9732
97.3%
ValueCountFrequency (%)
3261
< 0.1%
3271
< 0.1%
3361
< 0.1%
3391
< 0.1%
3401
< 0.1%
3512
< 0.1%
3572
< 0.1%
3621
< 0.1%
3631
< 0.1%
3641
< 0.1%
ValueCountFrequency (%)
187951
< 0.1%
187841
< 0.1%
187811
< 0.1%
187791
< 0.1%
187661
< 0.1%
187451
< 0.1%
187411
< 0.1%
187061
< 0.1%
186781
< 0.1%
186111
< 0.1%

x
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct495
Distinct (%)5.0%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean5.716036
Minimum0
Maximum10.23
Zeros1
Zeros (%)< 0.1%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum0
5-th percentile4.29
Q14.71
median5.68
Q36.53
95-th percentile7.65
Maximum10.23
Range10.23
Interquartile range (IQR)1.82

Descriptive statistics

Standard deviation1.118243951
Coefficient of variation (CV)0.1956327691
Kurtosis-0.6066053655
Mean5.716036
Median Absolute Deviation (MAD)0.92
Skewness0.4124316947
Sum57160.36
Variance1.250469534
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
4.3492
 
0.9%
4.3592
 
0.9%
4.3984
 
0.8%
4.3681
 
0.8%
4.3375
 
0.8%
4.3274
 
0.7%
4.3173
 
0.7%
4.372
 
0.7%
4.4271
 
0.7%
4.3769
 
0.7%
Other values (485)9217
92.2%
ValueCountFrequency (%)
01
 
< 0.1%
3.812
 
< 0.1%
3.852
 
< 0.1%
3.861
 
< 0.1%
3.881
 
< 0.1%
3.893
< 0.1%
3.93
< 0.1%
3.915
0.1%
3.925
0.1%
3.936
0.1%
ValueCountFrequency (%)
10.231
< 0.1%
10.141
< 0.1%
9.861
< 0.1%
9.442
< 0.1%
9.31
< 0.1%
9.111
< 0.1%
9.041
< 0.1%
8.991
< 0.1%
8.91
< 0.1%
8.881
< 0.1%

y
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct500
Distinct (%)5.0%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean5.726097
Minimum3.77
Maximum58.9
Zeros0
Zeros (%)0.0%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum3.77
5-th percentile4.3
Q14.72
median5.7
Q36.53
95-th percentile7.65
Maximum58.9
Range55.13
Interquartile range (IQR)1.81

Descriptive statistics

Standard deviation1.256195462
Coefficient of variation (CV)0.2193807514
Kurtosis338.2400271
Mean5.726097
Median Absolute Deviation (MAD)0.92
Skewness8.753843386
Sum57260.97
Variance1.578027039
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
4.3899
 
1.0%
4.3989
 
0.9%
4.3788
 
0.9%
4.3288
 
0.9%
4.3488
 
0.9%
4.3683
 
0.8%
4.3580
 
0.8%
4.3377
 
0.8%
4.4169
 
0.7%
4.3165
 
0.7%
Other values (490)9174
91.7%
ValueCountFrequency (%)
3.771
 
< 0.1%
3.781
 
< 0.1%
3.811
 
< 0.1%
3.841
 
< 0.1%
3.851
 
< 0.1%
3.861
 
< 0.1%
3.891
 
< 0.1%
3.95
0.1%
3.921
 
< 0.1%
3.933
< 0.1%
ValueCountFrequency (%)
58.91
< 0.1%
31.81
< 0.1%
10.161
< 0.1%
10.11
< 0.1%
9.811
< 0.1%
9.381
< 0.1%
9.371
< 0.1%
9.141
< 0.1%
9.021
< 0.1%
8.981
< 0.1%

z
Real number (ℝ≥0)

HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION

Distinct328
Distinct (%)3.3%
Missing0
Missing (%)0.0%
Infinite0
Infinite (%)0.0%
Mean3.529078
Minimum0
Maximum8.06
Zeros4
Zeros (%)< 0.1%
Negative0
Negative (%)0.0%
Memory size78.2 KiB

Quantile statistics

Minimum0
5-th percentile2.65
Q12.91
median3.51
Q34.03
95-th percentile4.7405
Maximum8.06
Range8.06
Interquartile range (IQR)1.12

Descriptive statistics

Standard deviation0.6945528852
Coefficient of variation (CV)0.1968085957
Kurtosis-0.2749503117
Mean3.529078
Median Absolute Deviation (MAD)0.56
Skewness0.3966858454
Sum35290.78
Variance0.4824037103
MonotonicityNot monotonic
Histogram with fixed size bins (bins=50)
ValueCountFrequency (%)
2.69166
 
1.7%
2.7151
 
1.5%
2.71139
 
1.4%
2.72126
 
1.3%
2.68125
 
1.2%
2.67119
 
1.2%
2.73115
 
1.1%
2.66103
 
1.0%
3.5599
 
1.0%
4.0198
 
1.0%
Other values (318)8759
87.6%
ValueCountFrequency (%)
04
< 0.1%
2.291
 
< 0.1%
2.31
 
< 0.1%
2.312
< 0.1%
2.322
< 0.1%
2.331
 
< 0.1%
2.351
 
< 0.1%
2.361
 
< 0.1%
2.372
< 0.1%
2.382
< 0.1%
ValueCountFrequency (%)
8.061
 
< 0.1%
6.721
 
< 0.1%
6.171
 
< 0.1%
6.131
 
< 0.1%
5.911
 
< 0.1%
5.621
 
< 0.1%
5.611
 
< 0.1%
5.61
 
< 0.1%
5.583
< 0.1%
5.571
 
< 0.1%

Interactions

Correlations

Spearman's ρ

The Spearman's rank correlation coefficient (ρ) is a measure of monotonic correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than Pearson's r. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.

To calculate ρ for two variables X and Y, one divides the covariance of the rank variables of X and Y by the product of their standard deviations.

Pearson's r

The Pearson's correlation coefficient (r) is a measure of linear correlation between two variables. It's value lies between -1 and +1, -1 indicating total negative linear correlation, 0 indicating no linear correlation and 1 indicating total positive linear correlation. Furthermore, r is invariant under separate changes in location and scale of the two variables, implying that for a linear function the angle to the x-axis does not affect r.

To calculate r for two variables X and Y, one divides the covariance of X and Y by the product of their standard deviations.

Kendall's τ

Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation coefficient (τ) measures ordinal association between two variables. It's value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation.

To calculate τ for two variables X and Y, one determines the number of concordant and discordant pairs of observations. τ is given by the number of concordant pairs minus the discordant pairs divided by the total number of pairs.

Cramér's V (φc)

Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association. The empirical estimators used for Cramér's V have been proved to be biased, even for large samples. We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found here.

Phik (φk)

Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case of a bivariate normal input distribution. There is extensive documentation available here.

Missing values

A simple visualization of nullity by column.
Nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.

Sample

First rows

df_indexcaratcutcolorclaritydepthtablepricexyz
077530.64IdealDVVS161.556.042815.575.593.43
1503940.52Very GoodEVVS261.258.022545.175.193.17
2389890.36IdealGVVS162.857.010534.554.522.85
3231621.32PremiumFVS161.759.0111776.996.954.30
4314450.40GoodJVS264.057.07654.704.673.00
5132071.21Very GoodISI160.656.054576.896.974.20
6358470.31IdealFVVS261.655.09174.354.382.69
7229871.56Very GoodHVS263.160.0110397.437.344.66
8460190.53GoodEVS263.755.017275.155.123.27
9514970.70FairFVS165.859.023815.585.483.64

Last rows

df_indexcaratcutcolorclaritydepthtablepricexyz
9990313570.27IdealFIF63.055.07604.134.162.61
9991481690.79GoodFSI264.360.019435.825.753.72
9992246292.01GoodISI259.159.0129648.098.144.80
9993381450.30Very GoodGVVS163.557.010134.274.242.70
9994124651.03IdealDSI162.356.052496.516.454.04
9995153720.27Very GoodEVVS261.260.06064.164.172.55
9996188101.06GoodFVS158.557.076996.686.733.92
999765641.00GoodESI261.564.040776.296.223.85
999828640.72IdealDVS260.557.032755.815.833.52
999986881.07IdealHVS262.157.044586.536.494.04
"]}}],"execution_count":0},{"cell_type":"code","source":["df = spark.table(\"bling\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"deb29a46-ad1b-4c5e-9cb6-7089e52a6d60"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[{"name":"df","typeStr":"pyspark.sql.dataframe.DataFrame","schema":{"fields":[{"metadata":{},"name":"_c0","nullable":true,"type":"integer"},{"metadata":{},"name":"carat","nullable":true,"type":"double"},{"metadata":{},"name":"cut","nullable":true,"type":"string"},{"metadata":{},"name":"color","nullable":true,"type":"string"},{"metadata":{},"name":"clarity","nullable":true,"type":"string"},{"metadata":{},"name":"depth","nullable":true,"type":"double"},{"metadata":{},"name":"table","nullable":true,"type":"double"},{"metadata":{},"name":"price","nullable":true,"type":"integer"},{"metadata":{},"name":"x","nullable":true,"type":"double"},{"metadata":{},"name":"y","nullable":true,"type":"double"},{"metadata":{},"name":"z","nullable":true,"type":"double"}],"type":"struct"},"tableIdentifier":"dbfs:/delta/diamonds"}],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["#first lets build the method\nimport pandas as pd\nfrom pyspark.sql import functions as F\nfrom pyspark.sql.functions import isnan, when, count, col\n\ndef dataprofile(data_all_df,data_cols):\n data_df = data_all_df.select(data_cols)\n columns2Bprofiled = data_df.columns\n global schema_name, table_name\n if not 'schema_name' in globals():\n schema_name = 'schema_name'\n if not 'table_name' in globals():\n table_name = 'table_name' \n dprof_df = pd.DataFrame({'schema_name':[schema_name] * len(data_df.columns),\\\n 'table_name':[table_name] * len(data_df.columns),\\\n 'column_names':data_df.columns,\\\n 'data_types':[x[1] for x in data_df.dtypes]}) \n dprof_df = dprof_df[['schema_name','table_name','column_names', 'data_types']]\n #dprof_df.set_index('column_names', inplace=True, drop=False)\n # ======================\n num_rows = data_df.count()\n dprof_df['num_rows'] = num_rows\n # ====================== \n # number of rows with nulls and nans \n df_nacounts = data_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data_df.columns \\\n if data_df.select(c).dtypes[0][1]!='timestamp']).toPandas().transpose()\n df_nacounts = df_nacounts.reset_index() \n df_nacounts.columns = ['column_names','num_null']\n dprof_df = pd.merge(dprof_df, df_nacounts, on = ['column_names'], how = 'left')\n # ========================\n # number of rows with white spaces (one or more space) or blanks\n num_spaces = [data_df.where(F.col(c).rlike('^\\\\s+$')).count() for c in data_df.columns]\n dprof_df['num_spaces'] = num_spaces\n num_blank = [data_df.where(F.col(c)=='').count() for c in data_df.columns]\n dprof_df['num_blank'] = num_blank\n # =========================\n # using the in built describe() function \n desc_df = data_df.describe().toPandas().transpose()\n desc_df.columns = ['count', 'mean', 'stddev', 'min', 'max']\n desc_df = desc_df.iloc[1:,:] \n desc_df = desc_df.reset_index() \n desc_df.columns.values[0] = 'column_names' \n desc_df = desc_df[['column_names','count', 'mean', 'stddev']] \n dprof_df = pd.merge(dprof_df, desc_df , on = ['column_names'], how = 'left')\n # ===========================================\n try:\n allminvalues = [data_df.select(F.min(x)).limit(1).toPandas().iloc[0][0] for x in columns2Bprofiled]\n allmaxvalues = [data_df.select(F.max(x)).limit(1).toPandas().iloc[0][0] for x in columns2Bprofiled]\n except:\n allminvalues = None\n allmaxvalues = None\n try:\n allmincounts = None\n allmaxcounts = None\n except:\n print('')\n \n df_counts = dprof_df[['column_names']]\n df_counts.insert(loc=0, column='min', value=allminvalues)\n df_counts.insert(loc=0, column='counts_min', value=allmincounts)\n df_counts.insert(loc=0, column='max', value=allmaxvalues)\n df_counts.insert(loc=0, column='counts_max', value=allmaxcounts)\n df_counts = df_counts[['column_names','min','counts_min','max','counts_max']]\n dprof_df = pd.merge(dprof_df, df_counts , on = ['column_names'], how = 'left') \n # ==========================================\n # number of distinct values in each column\n dprof_df['num_distinct'] = [data_df.select(x).distinct().count() for x in columns2Bprofiled]\n # ============================================\n # most frequently occuring value in a column and its count\n dprof_df['most_freq_valwcount'] = [data_df.groupBy(x).count().sort(\"count\",ascending=False).limit(1).\\\n toPandas().iloc[0].values.tolist() for x in columns2Bprofiled]\n dprof_df['most_freq_value'] = [x[0] for x in dprof_df['most_freq_valwcount']]\n dprof_df['most_freq_value_count'] = [x[1] for x in dprof_df['most_freq_valwcount']]\n dprof_df = dprof_df.drop(['most_freq_valwcount'],axis=1)\n # least frequently occuring value in a column and its count\n dprof_df['least_freq_valwcount'] = [data_df.groupBy(x).count().sort(\"count\",ascending=True).limit(1).\\\n toPandas().iloc[0].values.tolist() for x in columns2Bprofiled]\n dprof_df['least_freq_value'] = [x[0] for x in dprof_df['least_freq_valwcount']]\n dprof_df['least_freq_value_count'] = [x[1] for x in dprof_df['least_freq_valwcount']]\n dprof_df = dprof_df.drop(['least_freq_valwcount'],axis=1)\n\n return dprof_df\n \nprint('done')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"193ff84b-1826-49b3-8a34-2e2a5baef5f2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
done\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
done\n
"]}}],"execution_count":0},{"cell_type":"code","source":["dc = df.columns\ndp = dataprofile(df,dc)\ndp"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e16af345-88a4-4683-84d2-906f5af6d200"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
schema_nametable_namecolumn_namesdata_typesnum_rowsnum_nullnum_spacesnum_blankcountmeanstddevmincounts_minmaxcounts_maxnum_distinctmost_freq_valuemost_freq_value_countleast_freq_valueleast_freq_value_count
0schema_nametable_name_c0int539400005394026970.515571.2810969425371None53940None539401481.01481.0
1schema_nametable_namecaratdouble53940000539400.79793974786798520.47401124440541960.2None5.01None2730.32604.03.021.0
2schema_nametable_namecutstring5394000053940NoneNoneFairNoneVery GoodNone5Ideal21551.0Fair1610.0
3schema_nametable_namecolorstring5394000053940NoneNoneDNoneJNone7G11292.0J2808.0
4schema_nametable_nameclaritystring5394000053940NoneNoneI1NoneVVS2None8SI113065.0I1741.0
5schema_nametable_namedepthdouble539400005394061.749404894326241.432621318833652543.0None79.0None18462.02239.053.31.0
6schema_nametable_nametabledouble539400005394057.457183908046032.234490562821324743.0None95.0None12756.09881.064.21.0
7schema_nametable_namepriceint53940000539403932.7997219132373989.439738146397326None18823None11602605132.045191.0
8schema_nametable_namexdouble53940000539405.7311572117166091.12176074679249150.0None10.74None5544.37448.09.131.0
9schema_nametable_nameydouble53940000539405.7345259547644621.14213467412356160.0None58.9None5524.34437.058.91.0
10schema_nametable_namezdouble53940000539403.53873377827233160.70569884694998830.0None31.8None3752.7767.05.861.0
\n
","textData":"
Out[5]:
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
schema_nametable_namecolumn_namesdata_typesnum_rowsnum_nullnum_spacesnum_blankcountmeanstddevmincounts_minmaxcounts_maxnum_distinctmost_freq_valuemost_freq_value_countleast_freq_valueleast_freq_value_count
0schema_nametable_name_c0int539400005394026970.515571.2810969425371None53940None539401481.01481.0
1schema_nametable_namecaratdouble53940000539400.79793974786798520.47401124440541960.2None5.01None2730.32604.03.021.0
2schema_nametable_namecutstring5394000053940NoneNoneFairNoneVery GoodNone5Ideal21551.0Fair1610.0
3schema_nametable_namecolorstring5394000053940NoneNoneDNoneJNone7G11292.0J2808.0
4schema_nametable_nameclaritystring5394000053940NoneNoneI1NoneVVS2None8SI113065.0I1741.0
5schema_nametable_namedepthdouble539400005394061.749404894326241.432621318833652543.0None79.0None18462.02239.053.31.0
6schema_nametable_nametabledouble539400005394057.457183908046032.234490562821324743.0None95.0None12756.09881.064.21.0
7schema_nametable_namepriceint53940000539403932.7997219132373989.439738146397326None18823None11602605132.045191.0
8schema_nametable_namexdouble53940000539405.7311572117166091.12176074679249150.0None10.74None5544.37448.09.131.0
9schema_nametable_nameydouble53940000539405.7345259547644621.14213467412356160.0None58.9None5524.34437.058.91.0
10schema_nametable_namezdouble53940000539403.53873377827233160.70569884694998830.0None31.8None3752.7767.05.861.0
\n
"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b6a4e85d-8213-4fcc-99d4-3c6a7a82c7b0"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"Pandas_Profiling_Test","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":7032705}},"nbformat":4,"nbformat_minor":0}