{"cells":[{"cell_type":"code","source":["import pandas as pd"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"9f62a060-cab0-4567-8958-fd6e49efbf47"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df = pd.read_csv(\"/dbfs/FileStore/tables/diamonds.csv\", header='infer')\ndf"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"9ab1b725-aee4-4bb3-8b13-41ca18176d32"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"\n\n
\n \n \n \n carat \n cut \n color \n clarity \n depth \n table \n price \n x \n y \n z \n \n \n \n \n 0 \n 0.23 \n Ideal \n E \n SI2 \n 61.5 \n 55.0 \n 326 \n 3.95 \n 3.98 \n 2.43 \n \n \n 1 \n 0.21 \n Premium \n E \n SI1 \n 59.8 \n 61.0 \n 326 \n 3.89 \n 3.84 \n 2.31 \n \n \n 2 \n 0.23 \n Good \n E \n VS1 \n 56.9 \n 65.0 \n 327 \n 4.05 \n 4.07 \n 2.31 \n \n \n 3 \n 0.29 \n Premium \n I \n VS2 \n 62.4 \n 58.0 \n 334 \n 4.20 \n 4.23 \n 2.63 \n \n \n 4 \n 0.31 \n Good \n J \n SI2 \n 63.3 \n 58.0 \n 335 \n 4.34 \n 4.35 \n 2.75 \n \n \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n \n \n 53935 \n 0.72 \n Ideal \n D \n SI1 \n 60.8 \n 57.0 \n 2757 \n 5.75 \n 5.76 \n 3.50 \n \n \n 53936 \n 0.72 \n Good \n D \n SI1 \n 63.1 \n 55.0 \n 2757 \n 5.69 \n 5.75 \n 3.61 \n \n \n 53937 \n 0.70 \n Very Good \n D \n SI1 \n 62.8 \n 60.0 \n 2757 \n 5.66 \n 5.68 \n 3.56 \n \n \n 53938 \n 0.86 \n Premium \n H \n SI2 \n 61.0 \n 58.0 \n 2757 \n 6.15 \n 6.12 \n 3.74 \n \n \n 53939 \n 0.75 \n Ideal \n D \n SI2 \n 62.2 \n 55.0 \n 2757 \n 5.83 \n 5.87 \n 3.64 \n \n \n
\n
53940 rows × 10 columns
\n
","textData":"Out[2]:
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n\n
\n \n \n \n carat \n cut \n color \n clarity \n depth \n table \n price \n x \n y \n z \n \n \n \n \n 0 \n 0.23 \n Ideal \n E \n SI2 \n 61.5 \n 55.0 \n 326 \n 3.95 \n 3.98 \n 2.43 \n \n \n 1 \n 0.21 \n Premium \n E \n SI1 \n 59.8 \n 61.0 \n 326 \n 3.89 \n 3.84 \n 2.31 \n \n \n 2 \n 0.23 \n Good \n E \n VS1 \n 56.9 \n 65.0 \n 327 \n 4.05 \n 4.07 \n 2.31 \n \n \n 3 \n 0.29 \n Premium \n I \n VS2 \n 62.4 \n 58.0 \n 334 \n 4.20 \n 4.23 \n 2.63 \n \n \n 4 \n 0.31 \n Good \n J \n SI2 \n 63.3 \n 58.0 \n 335 \n 4.34 \n 4.35 \n 2.75 \n \n \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n ... \n \n \n 53935 \n 0.72 \n Ideal \n D \n SI1 \n 60.8 \n 57.0 \n 2757 \n 5.75 \n 5.76 \n 3.50 \n \n \n 53936 \n 0.72 \n Good \n D \n SI1 \n 63.1 \n 55.0 \n 2757 \n 5.69 \n 5.75 \n 3.61 \n \n \n 53937 \n 0.70 \n Very Good \n D \n SI1 \n 62.8 \n 60.0 \n 2757 \n 5.66 \n 5.68 \n 3.56 \n \n \n 53938 \n 0.86 \n Premium \n H \n SI2 \n 61.0 \n 58.0 \n 2757 \n 6.15 \n 6.12 \n 3.74 \n \n \n 53939 \n 0.75 \n Ideal \n D \n SI2 \n 62.2 \n 55.0 \n 2757 \n 5.83 \n 5.87 \n 3.64 \n \n \n
\n
53940 rows × 10 columns
\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df.head()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"826f4fc9-4e07-4e16-a51b-18cedc28cc53"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"\n\n
\n \n \n \n carat \n cut \n color \n clarity \n depth \n table \n price \n x \n y \n z \n \n \n \n \n 0 \n 0.23 \n Ideal \n E \n SI2 \n 61.5 \n 55.0 \n 326 \n 3.95 \n 3.98 \n 2.43 \n \n \n 1 \n 0.21 \n Premium \n E \n SI1 \n 59.8 \n 61.0 \n 326 \n 3.89 \n 3.84 \n 2.31 \n \n \n 2 \n 0.23 \n Good \n E \n VS1 \n 56.9 \n 65.0 \n 327 \n 4.05 \n 4.07 \n 2.31 \n \n \n 3 \n 0.29 \n Premium \n I \n VS2 \n 62.4 \n 58.0 \n 334 \n 4.20 \n 4.23 \n 2.63 \n \n \n 4 \n 0.31 \n Good \n J \n SI2 \n 63.3 \n 58.0 \n 335 \n 4.34 \n 4.35 \n 2.75 \n \n \n
\n
","textData":"Out[7]:
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n\n
\n \n \n \n carat \n cut \n color \n clarity \n depth \n table \n price \n x \n y \n z \n \n \n \n \n 0 \n 0.23 \n Ideal \n E \n SI2 \n 61.5 \n 55.0 \n 326 \n 3.95 \n 3.98 \n 2.43 \n \n \n 1 \n 0.21 \n Premium \n E \n SI1 \n 59.8 \n 61.0 \n 326 \n 3.89 \n 3.84 \n 2.31 \n \n \n 2 \n 0.23 \n Good \n E \n VS1 \n 56.9 \n 65.0 \n 327 \n 4.05 \n 4.07 \n 2.31 \n \n \n 3 \n 0.29 \n Premium \n I \n VS2 \n 62.4 \n 58.0 \n 334 \n 4.20 \n 4.23 \n 2.63 \n \n \n 4 \n 0.31 \n Good \n J \n SI2 \n 63.3 \n 58.0 \n 335 \n 4.34 \n 4.35 \n 2.75 \n \n \n
\n
"]}}],"execution_count":0},{"cell_type":"code","source":["!pip install pandas-profiling\n!pip install ipywidgets\n!pip install matplotlib==2.2.3\n\nfrom pandas_profiling import ProfileReport\nprof = ProfileReport(df.sample(n=10000))\n#prof = ProfileReport(df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"62d03235-8fad-4a03-b38b-2034eed42632"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["prof.to_file(output_file='output.html')\n\ndisplayHTML(prof.html)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1fe5229f-5ec6-4158-b07b-fb5076ebb18c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Pandas Profiling Report Dataset statistics
Number of variables 11 Number of observations 10000 Missing cells 0 Missing cells (%) 0.0% Duplicate rows 0 Duplicate rows (%) 0.0% Total size in memory 859.5 KiB Average record size in memory 88.0 B
Alerts
carat
is highly correlated with price
and 3 other fields High correlation price
is highly correlated with carat
and 3 other fields High correlation x
is highly correlated with carat
and 3 other fields High correlation y
is highly correlated with carat
and 3 other fields High correlation z
is highly correlated with carat
and 3 other fields High correlation carat
is highly correlated with price
and 3 other fields High correlation price
is highly correlated with carat
and 3 other fields High correlation x
is highly correlated with carat
and 3 other fields High correlation y
is highly correlated with carat
and 3 other fields High correlation z
is highly correlated with carat
and 3 other fields High correlation carat
is highly correlated with price
and 3 other fields High correlation price
is highly correlated with carat
and 3 other fields High correlation x
is highly correlated with carat
and 3 other fields High correlation y
is highly correlated with carat
and 3 other fields High correlation z
is highly correlated with carat
and 3 other fields High correlation df_index
is highly correlated with carat
and 3 other fields High correlation carat
is highly correlated with df_index
and 4 other fields High correlation cut
is highly correlated with depth
and 1 other fields High correlation depth
is highly correlated with cut
High correlation table
is highly correlated with cut
High correlation price
is highly correlated with df_index
and 3 other fields High correlation x
is highly correlated with df_index
and 4 other fields High correlation y
is highly correlated with carat
and 2 other fields High correlation z
is highly correlated with df_index
and 4 other fields High correlation df_index
has unique values Unique
Reproduction
Analysis started 2021-12-09 22:36:56.032662 Analysis finished 2021-12-09 22:37:11.760218 Duration 15.73 seconds Software version pandas-profiling v3.1.0 Download configuration config.json
df_index Real number (ℝ≥0 )
HIGH CORRELATION
UNIQUE
Distinct 10000 Distinct (%) 100.0% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 27088.2074
Minimum 1 Maximum 53922 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 1 5-th percentile 2652.65 Q1 13355.75 median 27192 Q3 40817 95-th percentile 51315.1 Maximum 53922 Range 53921 Interquartile range (IQR) 27461.25
Descriptive statistics
Standard deviation 15650.20796 Coefficient of variation (CV) 0.5777498573 Kurtosis -1.217596069 Mean 27088.2074 Median Absolute Deviation (MAD) 13724 Skewness -0.01240652578 Sum 270882074 Variance 244929009.2 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 49152 1
< 0.1% 26099 1
< 0.1% 45763 1
< 0.1% 51908 1
< 0.1% 17093 1
< 0.1% 30327 1
< 0.1% 21372 1
< 0.1% 15050 1
< 0.1% 51916 1
< 0.1% 37583 1
< 0.1% Other values (9990) 9990 99.9%
Value Count Frequency (%) 1 1 < 0.1%
2 1 < 0.1%
6 1 < 0.1%
10 1 < 0.1%
11 1 < 0.1%
17 1 < 0.1%
18 1 < 0.1%
28 1 < 0.1%
55 1 < 0.1%
59 1 < 0.1%
Value Count Frequency (%) 53922 1 < 0.1%
53920 1 < 0.1%
53918 1 < 0.1%
53914 1 < 0.1%
53912 1 < 0.1%
53907 1 < 0.1%
53906 1 < 0.1%
53905 1 < 0.1%
53901 1 < 0.1%
53891 1 < 0.1%
carat Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 237 Distinct (%) 2.4% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 0.791456
Minimum 0.2 Maximum 4.5 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 0.2 5-th percentile 0.3 Q1 0.4 median 0.7 Q3 1.04 95-th percentile 1.71 Maximum 4.5 Range 4.3 Interquartile range (IQR) 0.64
Descriptive statistics
Standard deviation 0.4728613326 Coefficient of variation (CV) 0.597457512 Kurtosis 1.490228108 Mean 0.791456 Median Absolute Deviation (MAD) 0.32 Skewness 1.162005658 Sum 7914.56 Variance 0.2235978398 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 0.3 481
4.8% 0.31 447
4.5% 1.01 386
3.9% 0.7 368
3.7% 0.32 346
3.5% 1 287
2.9% 0.9 262
2.6% 0.4 259
2.6% 0.71 237
2.4% 0.5 232
2.3% Other values (227) 6695 67.0%
Value Count Frequency (%) 0.2 2
< 0.1% 0.21 4
< 0.1% 0.22 1
< 0.1% 0.23 48 0.5%
0.24 49 0.5%
0.25 41 0.4%
0.26 44 0.4%
0.27 45 0.4%
0.28 29 0.3%
0.29 24 0.2%
Value Count Frequency (%) 4.5 1 < 0.1%
4.01 1 < 0.1%
3.67 1 < 0.1%
3.02 1 < 0.1%
3.01 2 < 0.1%
3 1 < 0.1%
2.75 1 < 0.1%
2.74 1 < 0.1%
2.68 1 < 0.1%
2.65 1 < 0.1%
cut Categorical
HIGH CORRELATION
Distinct 5 Distinct (%) 0.1% Missing 0 Missing (%) 0.0% Memory size 78.2 KiB
Ideal 3931
Premium 2603
Very Good 2308
Good 883
Fair
275
Toggle details
Length
Max length 9 Median length 5 Mean length 6.328 Min length 4
Characters and Unicode
Total characters 0 Distinct characters 0 Distinct categories 0 ? Distinct scripts 0 ? Distinct blocks 0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.
Sample
1st row Ideal 2nd row Very Good 3rd row Ideal 4th row Premium 5th row Good
Common Values Value Count Frequency (%) Ideal 3931 39.3%
Premium 2603 26.0%
Very Good 2308 23.1%
Good 883
8.8% Fair 275
2.8%
Length Histogram of lengths of the category
Value Count Frequency (%) ideal 3931 31.9%
good 3191 25.9%
premium 2603 21.1%
very 2308 18.8%
fair 275
2.2%
Most occurring characters Value Count Frequency (%) No values found.
Most occurring categories Value Count Frequency (%) No values found.
Most frequent character per category Most occurring scripts Value Count Frequency (%) No values found.
Most frequent character per script Most occurring blocks Value Count Frequency (%) No values found.
Most frequent character per block Distinct 7 Distinct (%) 0.1% Missing 0 Missing (%) 0.0% Memory size 78.2 KiB
G 2089
E 1811
F 1803
H 1559
D 1252
Other values (2) 1486
Toggle details
Length
Max length 1 Median length 1 Mean length 1 Min length 1
Characters and Unicode
Total characters 0 Distinct characters 0 Distinct categories 0 ? Distinct scripts 0 ? Distinct blocks 0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.
Sample
1st row D 2nd row E 3rd row G 4th row F 5th row J
Common Values Value Count Frequency (%) G 2089 20.9%
E 1811 18.1%
F 1803 18.0%
H 1559 15.6%
D 1252 12.5%
I 1001 10.0%
J 485
4.9%
Length Histogram of lengths of the category
Value Count Frequency (%) g 2089 20.9%
e 1811 18.1%
f 1803 18.0%
h 1559 15.6%
d 1252 12.5%
i 1001 10.0%
j 485
4.9%
Most occurring characters Value Count Frequency (%) No values found.
Most occurring categories Value Count Frequency (%) No values found.
Most frequent character per category Most occurring scripts Value Count Frequency (%) No values found.
Most frequent character per script Most occurring blocks Value Count Frequency (%) No values found.
Most frequent character per block Distinct 8 Distinct (%) 0.1% Missing 0 Missing (%) 0.0% Memory size 78.2 KiB
SI1 2422
VS2 2284
SI2 1719
VS1 1528
VVS2 905
Other values (3) 1142
Toggle details
Length
Max length 4 Median length 3 Mean length 3.1147 Min length 2
Characters and Unicode
Total characters 0 Distinct characters 0 Distinct categories 0 ? Distinct scripts 0 ? Distinct blocks 0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.
Sample
1st row VVS1 2nd row VVS2 3rd row VVS1 4th row VS1 5th row VS2
Common Values Value Count Frequency (%) SI1 2422 24.2%
VS2 2284 22.8%
SI2 1719 17.2%
VS1 1528 15.3%
VVS2 905
9.0% VVS1 692
6.9% IF 319
3.2% I1 131
1.3%
Length Histogram of lengths of the category
Value Count Frequency (%) si1 2422 24.2%
vs2 2284 22.8%
si2 1719 17.2%
vs1 1528 15.3%
vvs2 905
9.0% vvs1 692
6.9% if 319
3.2% i1 131
1.3%
Most occurring characters Value Count Frequency (%) No values found.
Most occurring categories Value Count Frequency (%) No values found.
Most frequent character per category Most occurring scripts Value Count Frequency (%) No values found.
Most frequent character per script Most occurring blocks Value Count Frequency (%) No values found.
Most frequent character per block depth Real number (ℝ≥0 )
HIGH CORRELATION
Distinct 139 Distinct (%) 1.4% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 61.744
Minimum 44 Maximum 73.6 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 44 5-th percentile 59.3 Q1 61.1 median 61.8 Q3 62.5 95-th percentile 63.705 Maximum 73.6 Range 29.6 Interquartile range (IQR) 1.4
Descriptive statistics
Standard deviation 1.416036199 Coefficient of variation (CV) 0.02293398871 Kurtosis 6.215816952 Mean 61.744 Median Absolute Deviation (MAD) 0.7 Skewness -0.2259760176 Sum 617440 Variance 2.005158516 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 62 409
4.1% 61.9 403
4.0% 62.2 387
3.9% 61.6 384
3.8% 62.1 376
3.8% 61.8 373
3.7% 61.7 362
3.6% 62.3 360
3.6% 62.4 354
3.5% 61.5 301
3.0% Other values (129) 6291 62.9%
Value Count Frequency (%) 44 1
< 0.1% 52.3 1
< 0.1% 53.1 1
< 0.1% 55.2 1
< 0.1% 55.3 2 < 0.1%
55.5 2 < 0.1%
55.6 2 < 0.1%
55.8 2 < 0.1%
55.9 3 < 0.1%
56 3 < 0.1%
Value Count Frequency (%) 73.6 1
< 0.1% 72.9 1
< 0.1% 71.2 1
< 0.1% 70.5 1
< 0.1% 70.2 1
< 0.1% 70.1 1
< 0.1% 69.5 1
< 0.1% 69 1
< 0.1% 68.9 3 < 0.1%
68.7 1
< 0.1%
table Real number (ℝ≥0 )
HIGH CORRELATION
Distinct 90 Distinct (%) 0.9% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 57.44649
Minimum 43 Maximum 71 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 43 5-th percentile 54 Q1 56 median 57 Q3 59 95-th percentile 61 Maximum 71 Range 28 Interquartile range (IQR) 3
Descriptive statistics
Standard deviation 2.223800847 Coefficient of variation (CV) 0.03871082197 Kurtosis 1.249490598 Mean 57.44649 Median Absolute Deviation (MAD) 1 Skewness 0.6529730234 Sum 574464.9 Variance 4.945290209 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 57 1817 18.2%
56 1756 17.6%
58 1556 15.6%
59 1254 12.5%
55 1213 12.1%
60 785 7.8%
54 491
4.9% 61 404
4.0% 62 232
2.3% 63 111
1.1% Other values (80) 381
3.8%
Value Count Frequency (%) 43 1
< 0.1% 49 1
< 0.1% 51 3
< 0.1% 52 19
0.2% 52.8 1
< 0.1% 53 94 0.9%
53.1 1
< 0.1% 53.4 1
< 0.1% 53.5 3
< 0.1% 53.6 3
< 0.1%
Value Count Frequency (%) 71 1
< 0.1% 70 2
< 0.1% 69 3
< 0.1% 68 3
< 0.1% 67 8
0.1% 66 15
0.1% 65 24
0.2% 64 49 0.5%
63 111 1.1%
62.5 1
< 0.1%
price Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 5091 Distinct (%) 50.9% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 3873.9908
Minimum 326 Maximum 18795 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 326 5-th percentile 544 Q1 950 median 2361 Q3 5226 95-th percentile 13016.15 Maximum 18795 Range 18469 Interquartile range (IQR) 4276
Descriptive statistics
Standard deviation 3965.051797 Coefficient of variation (CV) 1.023505734 Kurtosis 2.363425826 Mean 3873.9908 Median Absolute Deviation (MAD) 1631.5 Skewness 1.663784166 Sum 38739908 Variance 15721635.75 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 544 32
0.3% 625 31
0.3% 605 30
0.3% 776 29
0.3% 698 28
0.3% 828 25
0.2% 684 24
0.2% 720 23
0.2% 789 23
0.2% 552 23
0.2% Other values (5081) 9732 97.3%
Value Count Frequency (%) 326 1 < 0.1%
327 1 < 0.1%
336 1 < 0.1%
339 1 < 0.1%
340 1 < 0.1%
351 2 < 0.1%
357 2 < 0.1%
362 1 < 0.1%
363 1 < 0.1%
364 1 < 0.1%
Value Count Frequency (%) 18795 1 < 0.1%
18784 1 < 0.1%
18781 1 < 0.1%
18779 1 < 0.1%
18766 1 < 0.1%
18745 1 < 0.1%
18741 1 < 0.1%
18706 1 < 0.1%
18678 1 < 0.1%
18611 1 < 0.1%
x Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 495 Distinct (%) 5.0% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 5.716036
Minimum 0 Maximum 10.23 Zeros 1 Zeros (%) < 0.1% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 0 5-th percentile 4.29 Q1 4.71 median 5.68 Q3 6.53 95-th percentile 7.65 Maximum 10.23 Range 10.23 Interquartile range (IQR) 1.82
Descriptive statistics
Standard deviation 1.118243951 Coefficient of variation (CV) 0.1956327691 Kurtosis -0.6066053655 Mean 5.716036 Median Absolute Deviation (MAD) 0.92 Skewness 0.4124316947 Sum 57160.36 Variance 1.250469534 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 4.34 92
0.9% 4.35 92
0.9% 4.39 84
0.8% 4.36 81
0.8% 4.33 75
0.8% 4.32 74
0.7% 4.31 73
0.7% 4.3 72
0.7% 4.42 71
0.7% 4.37 69
0.7% Other values (485) 9217 92.2%
Value Count Frequency (%) 0 1
< 0.1% 3.81 2
< 0.1% 3.85 2
< 0.1% 3.86 1
< 0.1% 3.88 1
< 0.1% 3.89 3 < 0.1%
3.9 3 < 0.1%
3.91 5 0.1%
3.92 5 0.1%
3.93 6 0.1%
Value Count Frequency (%) 10.23 1 < 0.1%
10.14 1 < 0.1%
9.86 1 < 0.1%
9.44 2 < 0.1%
9.3 1 < 0.1%
9.11 1 < 0.1%
9.04 1 < 0.1%
8.99 1 < 0.1%
8.9 1 < 0.1%
8.88 1 < 0.1%
y Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 500 Distinct (%) 5.0% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 5.726097
Minimum 3.77 Maximum 58.9 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 3.77 5-th percentile 4.3 Q1 4.72 median 5.7 Q3 6.53 95-th percentile 7.65 Maximum 58.9 Range 55.13 Interquartile range (IQR) 1.81
Descriptive statistics
Standard deviation 1.256195462 Coefficient of variation (CV) 0.2193807514 Kurtosis 338.2400271 Mean 5.726097 Median Absolute Deviation (MAD) 0.92 Skewness 8.753843386 Sum 57260.97 Variance 1.578027039 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 4.38 99
1.0% 4.39 89
0.9% 4.37 88
0.9% 4.32 88
0.9% 4.34 88
0.9% 4.36 83
0.8% 4.35 80
0.8% 4.33 77
0.8% 4.41 69
0.7% 4.31 65
0.7% Other values (490) 9174 91.7%
Value Count Frequency (%) 3.77 1
< 0.1% 3.78 1
< 0.1% 3.81 1
< 0.1% 3.84 1
< 0.1% 3.85 1
< 0.1% 3.86 1
< 0.1% 3.89 1
< 0.1% 3.9 5 0.1%
3.92 1
< 0.1% 3.93 3 < 0.1%
Value Count Frequency (%) 58.9 1 < 0.1%
31.8 1 < 0.1%
10.16 1 < 0.1%
10.1 1 < 0.1%
9.81 1 < 0.1%
9.38 1 < 0.1%
9.37 1 < 0.1%
9.14 1 < 0.1%
9.02 1 < 0.1%
8.98 1 < 0.1%
z Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 328 Distinct (%) 3.3% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 3.529078
Minimum 0 Maximum 8.06 Zeros 4 Zeros (%) < 0.1% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 0 5-th percentile 2.65 Q1 2.91 median 3.51 Q3 4.03 95-th percentile 4.7405 Maximum 8.06 Range 8.06 Interquartile range (IQR) 1.12
Descriptive statistics
Standard deviation 0.6945528852 Coefficient of variation (CV) 0.1968085957 Kurtosis -0.2749503117 Mean 3.529078 Median Absolute Deviation (MAD) 0.56 Skewness 0.3966858454 Sum 35290.78 Variance 0.4824037103 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 2.69 166
1.7% 2.7 151
1.5% 2.71 139
1.4% 2.72 126
1.3% 2.68 125
1.2% 2.67 119
1.2% 2.73 115
1.1% 2.66 103
1.0% 3.55 99
1.0% 4.01 98
1.0% Other values (318) 8759 87.6%
Value Count Frequency (%) 0 4 < 0.1%
2.29 1
< 0.1% 2.3 1
< 0.1% 2.31 2 < 0.1%
2.32 2 < 0.1%
2.33 1
< 0.1% 2.35 1
< 0.1% 2.36 1
< 0.1% 2.37 2 < 0.1%
2.38 2 < 0.1%
Value Count Frequency (%) 8.06 1
< 0.1% 6.72 1
< 0.1% 6.17 1
< 0.1% 6.13 1
< 0.1% 5.91 1
< 0.1% 5.62 1
< 0.1% 5.61 1
< 0.1% 5.6 1
< 0.1% 5.58 3 < 0.1%
5.57 1
< 0.1%
Toggle correlation descriptions
Spearman's ρ The Spearman's rank correlation coefficient (ρ ) is a measure of monotonic correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than Pearson's r . It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation. To calculate ρ for two variables X and Y , one divides the covariance of the rank variables of X and Y by the product of their standard deviations.
Pearson's r The Pearson's correlation coefficient (r ) is a measure of linear correlation between two variables. It's value lies between -1 and +1, -1 indicating total negative linear correlation, 0 indicating no linear correlation and 1 indicating total positive linear correlation. Furthermore, r is invariant under separate changes in location and scale of the two variables, implying that for a linear function the angle to the x-axis does not affect r . To calculate r for two variables X and Y , one divides the covariance of X and Y by the product of their standard deviations.
Kendall's τ Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation coefficient (τ ) measures ordinal association between two variables. It's value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation. To calculate τ for two variables X and Y , one determines the number of concordant and discordant pairs of observations. τ is given by the number of concordant pairs minus the discordant pairs divided by the total number of pairs.Cramér's V (φc) Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association. The empirical estimators used for Cramér's V have been proved to be biased, even for large samples. We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found
here .
Phik (φk) Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case of a bivariate normal input distribution. There is extensive documentation available
here .
A simple visualization of nullity by column.
Nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.
First rows df_index carat cut color clarity depth table price x y z 0 7753 0.64 Ideal D VVS1 61.5 56.0 4281 5.57 5.59 3.43 1 50394 0.52 Very Good E VVS2 61.2 58.0 2254 5.17 5.19 3.17 2 38989 0.36 Ideal G VVS1 62.8 57.0 1053 4.55 4.52 2.85 3 23162 1.32 Premium F VS1 61.7 59.0 11177 6.99 6.95 4.30 4 31445 0.40 Good J VS2 64.0 57.0 765 4.70 4.67 3.00 5 13207 1.21 Very Good I SI1 60.6 56.0 5457 6.89 6.97 4.20 6 35847 0.31 Ideal F VVS2 61.6 55.0 917 4.35 4.38 2.69 7 22987 1.56 Very Good H VS2 63.1 60.0 11039 7.43 7.34 4.66 8 46019 0.53 Good E VS2 63.7 55.0 1727 5.15 5.12 3.27 9 51497 0.70 Fair F VS1 65.8 59.0 2381 5.58 5.48 3.64
Last rows df_index carat cut color clarity depth table price x y z 9990 31357 0.27 Ideal F IF 63.0 55.0 760 4.13 4.16 2.61 9991 48169 0.79 Good F SI2 64.3 60.0 1943 5.82 5.75 3.72 9992 24629 2.01 Good I SI2 59.1 59.0 12964 8.09 8.14 4.80 9993 38145 0.30 Very Good G VVS1 63.5 57.0 1013 4.27 4.24 2.70 9994 12465 1.03 Ideal D SI1 62.3 56.0 5249 6.51 6.45 4.04 9995 15372 0.27 Very Good E VVS2 61.2 60.0 606 4.16 4.17 2.55 9996 18810 1.06 Good F VS1 58.5 57.0 7699 6.68 6.73 3.92 9997 6564 1.00 Good E SI2 61.5 64.0 4077 6.29 6.22 3.85 9998 2864 0.72 Ideal D VS2 60.5 57.0 3275 5.81 5.83 3.52 9999 8688 1.07 Ideal H VS2 62.1 57.0 4458 6.53 6.49 4.04
","textData":"\rSummarize dataset: 0%| | 0/5 [00:00<?, ?it/s]\rSummarize dataset: 0%| | 0/16 [00:00<?, ?it/s, Describe variable:table]\rSummarize dataset: 6%|▋ | 1/16 [00:00<00:05, 2.89it/s, Describe variable:table]\rSummarize dataset: 6%|▋ | 1/16 [00:00<00:05, 2.89it/s, Describe variable:depth]\rSummarize dataset: 12%|█▎ | 2/16 [00:00<00:02, 4.88it/s, Describe variable:depth]\rSummarize dataset: 12%|█▎ | 2/16 [00:00<00:02, 4.88it/s, Describe variable:y] \rSummarize dataset: 19%|█▉ | 3/16 [00:00<00:02, 4.88it/s, Describe variable:x]\rSummarize dataset: 25%|██▌ | 4/16 [00:00<00:02, 4.88it/s, Describe variable:carat]\rSummarize dataset: 31%|███▏ | 5/16 [00:00<00:02, 4.88it/s, Describe variable:z] \rSummarize dataset: 38%|███▊ | 6/16 [00:00<00:02, 4.88it/s, Describe variable:df_index]\rSummarize dataset: 44%|████▍ | 7/16 [00:00<00:01, 4.88it/s, Describe variable:price] \rSummarize dataset: 50%|█████ | 8/16 [00:00<00:01, 4.88it/s, Describe variable:color]\rSummarize dataset: 56%|█████▋ | 9/16 [00:00<00:00, 19.55it/s, Describe variable:color]\rSummarize dataset: 56%|█████▋ | 9/16 [00:00<00:00, 19.55it/s, Describe variable:cut] \rSummarize dataset: 62%|██████▎ | 10/16 [00:00<00:00, 19.55it/s, Describe variable:clarity]\rSummarize dataset: 69%|██████▉ | 11/16 [00:00<00:00, 19.55it/s, Get variable types] \rSummarize dataset: 55%|█████▍ | 12/22 [00:00<00:00, 19.55it/s, Calculate spearman correlation]\rSummarize dataset: 59%|█████▉ | 13/22 [00:00<00:00, 19.55it/s, Calculate pearson correlation] \rSummarize dataset: 64%|██████▎ | 14/22 [00:00<00:00, 19.55it/s, Calculate kendall correlation]\rSummarize dataset: 68%|██████▊ | 15/22 [00:00<00:00, 29.27it/s, Calculate kendall correlation]\rSummarize dataset: 68%|██████▊ | 15/22 [00:00<00:00, 29.27it/s, Calculate cramers correlation]\rSummarize dataset: 73%|███████▎ | 16/22 [00:00<00:00, 29.27it/s, Calculate phi_k correlation] \rSummarize dataset: 77%|███████▋ | 17/22 [00:02<00:00, 29.27it/s, Get scatter matrix] \rSummarize dataset: 20%|█▉ | 17/86 [00:02<00:02, 29.27it/s, scatter df_index, df_index]\rSummarize dataset: 21%|██ | 18/86 [00:02<00:02, 29.27it/s, scatter carat, df_index] \rSummarize dataset: 22%|██▏ | 19/86 [00:03<00:14, 4.77it/s, scatter carat, df_index]\rSummarize dataset: 22%|██▏ | 19/86 [00:03<00:14, 4.77it/s, scatter depth, df_index]\rSummarize dataset: 23%|██▎ | 20/86 [00:03<00:13, 4.77it/s, scatter table, df_index]\rSummarize dataset: 24%|██▍ | 21/86 [00:03<00:13, 4.77it/s, scatter price, df_index]\rSummarize dataset: 26%|██▌ | 22/86 [00:03<00:13, 4.87it/s, scatter price, df_index]\rSummarize dataset: 26%|██▌ | 22/86 [00:03<00:13, 4.87it/s, scatter x, df_index] \rSummarize dataset: 27%|██▋ | 23/86 [00:03<00:12, 4.87it/s, scatter y, df_index]\rSummarize dataset: 28%|██▊ | 24/86 [00:03<00:12, 4.93it/s, scatter y, df_index]\rSummarize dataset: 28%|██▊ | 24/86 [00:03<00:12, 4.93it/s, scatter z, df_index]\rSummarize dataset: 29%|██▉ | 25/86 [00:04<00:12, 4.93it/s, scatter df_index, carat]\rSummarize dataset: 30%|███ | 26/86 [00:04<00:12, 4.96it/s, scatter df_index, carat]\rSummarize dataset: 30%|███ | 26/86 [00:04<00:12, 4.96it/s, scatter carat, carat] \rSummarize dataset: 31%|███▏ | 27/86 [00:04<00:11, 4.96it/s, scatter depth, carat]\rSummarize dataset: 33%|███▎ | 28/86 [00:04<00:11, 5.01it/s, scatter depth, carat]\rSummarize dataset: 33%|███▎ | 28/86 [00:04<00:11, 5.01it/s, scatter table, carat]\rSummarize dataset: 34%|███▎ | 29/86 [00:04<00:11, 5.05it/s, scatter table, carat]\rSummarize dataset: 34%|███▎ | 29/86 [00:04<00:11, 5.05it/s, scatter price, carat]\rSummarize dataset: 35%|███▍ | 30/86 [00:05<00:11, 5.05it/s, scatter price, carat]\rSummarize dataset: 35%|███▍ | 30/86 [00:05<00:11, 5.05it/s, scatter x, carat] \rSummarize dataset: 36%|███▌ | 31/86 [00:05<00:10, 5.08it/s, scatter x, carat]\rSummarize dataset: 36%|███▌ | 31/86 [00:05<00:10, 5.08it/s, scatter y, carat]\rSummarize dataset: 37%|███▋ | 32/86 [00:05<00:10, 5.14it/s, scatter y, carat]\rSummarize dataset: 37%|███▋ | 32/86 [00:05<00:10, 5.14it/s, scatter z, carat]\rSummarize dataset: 38%|███▊ | 33/86 [00:05<00:10, 5.11it/s, scatter z, carat]\rSummarize dataset: 38%|███▊ | 33/86 [00:05<00:10, 5.11it/s, scatter df_index, depth]\rSummarize dataset: 40%|███▉ | 34/86 [00:05<00:10, 5.12it/s, scatter df_index, depth]\rSummarize dataset: 40%|███▉ | 34/86 [00:05<00:10, 5.12it/s, scatter carat, depth] \rSummarize dataset: 41%|████ | 35/86 [00:06<00:09, 5.14it/s, scatter carat, depth]\rSummarize dataset: 41%|████ | 35/86 [00:06<00:09, 5.14it/s, scatter depth, depth]\rSummarize dataset: 42%|████▏ | 36/86 [00:06<00:09, 5.10it/s, scatter depth, depth]\rSummarize dataset: 42%|████▏ | 36/86 [00:06<00:09, 5.10it/s, scatter table, depth]\rSummarize dataset: 43%|████▎ | 37/86 [00:06<00:09, 5.13it/s, scatter table, depth]\rSummarize dataset: 43%|████▎ | 37/86 [00:06<00:09, 5.13it/s, scatter price, depth]\rSummarize dataset: 44%|████▍ | 38/86 [00:06<00:09, 5.04it/s, scatter price, depth]\rSummarize dataset: 44%|████▍ | 38/86 [00:06<00:09, 5.04it/s, scatter x, depth] \rSummarize dataset: 45%|████▌ | 39/86 [00:06<00:09, 5.11it/s, scatter x, depth]\rSummarize dataset: 45%|████▌ | 39/86 [00:06<00:09, 5.11it/s, scatter y, depth]\rSummarize dataset: 47%|████▋ | 40/86 [00:07<00:08, 5.13it/s, scatter y, depth]\rSummarize dataset: 47%|████▋ | 40/86 [00:07<00:08, 5.13it/s, scatter z, depth]\rSummarize dataset: 48%|████▊ | 41/86 [00:07<00:08, 5.09it/s, scatter z, depth]\rSummarize dataset: 48%|████▊ | 41/86 [00:07<00:08, 5.09it/s, scatter df_index, table]\rSummarize dataset: 49%|████▉ | 42/86 [00:07<00:08, 5.14it/s, scatter df_index, table]\rSummarize dataset: 49%|████▉ | 42/86 [00:07<00:08, 5.14it/s, scatter carat, table] \rSummarize dataset: 50%|█████ | 43/86 [00:07<00:08, 5.22it/s, scatter carat, table]\rSummarize dataset: 50%|█████ | 43/86 [00:07<00:08, 5.22it/s, scatter depth, table]\rSummarize dataset: 51%|█████ | 44/86 [00:07<00:08, 5.15it/s, scatter depth, table]\rSummarize dataset: 51%|█████ | 44/86 [00:07<00:08, 5.15it/s, scatter table, table]\rSummarize dataset: 52%|█████▏ | 45/86 [00:08<00:07, 5.18it/s, scatter table, table]\rSummarize dataset: 52%|█████▏ | 45/86 [00:08<00:07, 5.18it/s, scatter price, table]\rSummarize dataset: 53%|█████▎ | 46/86 [00:08<00:07, 5.04it/s, scatter price, table]\rSummarize dataset: 53%|█████▎ | 46/86 [00:08<00:07, 5.04it/s, scatter x, table] \rSummarize dataset: 55%|█████▍ | 47/86 [00:08<00:07, 5.01it/s, scatter x, table]\rSummarize dataset: 55%|█████▍ | 47/86 [00:08<00:07, 5.01it/s, scatter y, table]\rSummarize dataset: 56%|█████▌ | 48/86 [00:08<00:09, 4.20it/s, scatter y, table]\rSummarize dataset: 56%|█████▌ | 48/86 [00:08<00:09, 4.20it/s, scatter z, table]\rSummarize dataset: 57%|█████▋ | 49/86 [00:08<00:08, 4.45it/s, scatter z, table]\rSummarize dataset: 57%|█████▋ | 49/86 [00:08<00:08, 4.45it/s, scatter df_index, price]\rSummarize dataset: 58%|█████▊ | 50/86 [00:09<00:07, 4.52it/s, scatter df_index, price]\rSummarize dataset: 58%|█████▊ | 50/86 [00:09<00:07, 4.52it/s, scatter carat, price] \rSummarize dataset: 59%|█████▉ | 51/86 [00:09<00:07, 4.68it/s, scatter carat, price]\rSummarize dataset: 59%|█████▉ | 51/86 [00:09<00:07, 4.68it/s, scatter depth, price]\rSummarize dataset: 60%|██████ | 52/86 [00:09<00:07, 4.70it/s, scatter depth, price]\rSummarize dataset: 60%|██████ | 52/86 [00:09<00:07, 4.70it/s, scatter table, price]\rSummarize dataset: 62%|██████▏ | 53/86 [00:09<00:06, 4.78it/s, scatter table, price]\rSummarize dataset: 62%|██████▏ | 53/86 [00:09<00:06, 4.78it/s, scatter price, price]\rSummarize dataset: 63%|██████▎ | 54/86 [00:10<00:06, 4.79it/s, scatter price, price]\rSummarize dataset: 63%|██████▎ | 54/86 [00:10<00:06, 4.79it/s, scatter x, price] \rSummarize dataset: 64%|██████▍ | 55/86 [00:10<00:06, 4.82it/s, scatter x, price]\rSummarize dataset: 64%|██████▍ | 55/86 [00:10<00:06, 4.82it/s, scatter y, price]\rSummarize dataset: 65%|██████▌ | 56/86 [00:10<00:06, 4.88it/s, scatter y, price]\rSummarize dataset: 65%|██████▌ | 56/86 [00:10<00:06, 4.88it/s, scatter z, price]\rSummarize dataset: 66%|██████▋ | 57/86 [00:10<00:06, 4.80it/s, scatter z, price]\rSummarize dataset: 66%|██████▋ | 57/86 [00:10<00:06, 4.80it/s, scatter df_index, x]\rSummarize dataset: 67%|██████▋ | 58/86 [00:10<00:05, 4.98it/s, scatter df_index, x]\rSummarize dataset: 67%|██████▋ | 58/86 [00:10<00:05, 4.98it/s, scatter carat, x] \rSummarize dataset: 69%|██████▊ | 59/86 [00:10<00:05, 5.17it/s, scatter carat, x]\rSummarize dataset: 69%|██████▊ | 59/86 [00:10<00:05, 5.17it/s, scatter depth, x]\rSummarize dataset: 70%|██████▉ | 60/86 [00:11<00:05, 5.08it/s, scatter depth, x]\rSummarize dataset: 70%|██████▉ | 60/86 [00:11<00:05, 5.08it/s, scatter table, x]\rSummarize dataset: 71%|███████ | 61/86 [00:11<00:04, 5.23it/s, scatter table, x]\rSummarize dataset: 71%|███████ | 61/86 [00:11<00:04, 5.23it/s, scatter price, x]\rSummarize dataset: 72%|███████▏ | 62/86 [00:11<00:04, 5.27it/s, scatter price, x]\rSummarize dataset: 72%|███████▏ | 62/86 [00:11<00:04, 5.27it/s, scatter x, x] \rSummarize dataset: 73%|███████▎ | 63/86 [00:11<00:04, 5.35it/s, scatter x, x]\rSummarize dataset: 73%|███████▎ | 63/86 [00:11<00:04, 5.35it/s, scatter y, x]\rSummarize dataset: 74%|███████▍ | 64/86 [00:11<00:04, 5.41it/s, scatter y, x]\rSummarize dataset: 74%|███████▍ | 64/86 [00:11<00:04, 5.41it/s, scatter z, x]\rSummarize dataset: 76%|███████▌ | 65/86 [00:12<00:03, 5.39it/s, scatter z, x]\rSummarize dataset: 76%|███████▌ | 65/86 [00:12<00:03, 5.39it/s, scatter df_index, y]\rSummarize dataset: 77%|███████▋ | 66/86 [00:12<00:03, 5.48it/s, scatter df_index, y]\rSummarize dataset: 77%|███████▋ | 66/86 [00:12<00:03, 5.48it/s, scatter carat, y] \rSummarize dataset: 78%|███████▊ | 67/86 [00:12<00:03, 5.57it/s, scatter carat, y]\rSummarize dataset: 78%|███████▊ | 67/86 [00:12<00:03, 5.57it/s, scatter depth, y]\rSummarize dataset: 79%|███████▉ | 68/86 [00:12<00:03, 5.50it/s, scatter depth, y]\rSummarize dataset: 79%|███████▉ | 68/86 [00:12<00:03, 5.50it/s, scatter table, y]\rSummarize dataset: 80%|████████ | 69/86 [00:12<00:03, 5.55it/s, scatter table, y]\rSummarize dataset: 80%|████████ | 69/86 [00:12<00:03, 5.55it/s, scatter price, y]\rSummarize dataset: 81%|████████▏ | 70/86 [00:13<00:02, 5.43it/s, scatter price, y]\rSummarize dataset: 81%|████████▏ | 70/86 [00:13<00:02, 5.43it/s, scatter x, y] \rSummarize dataset: 83%|████████▎ | 71/86 [00:13<00:02, 5.38it/s, scatter x, y]\rSummarize dataset: 83%|████████▎ | 71/86 [00:13<00:02, 5.38it/s, scatter y, y]\rSummarize dataset: 84%|████████▎ | 72/86 [00:13<00:02, 5.40it/s, scatter y, y]\rSummarize dataset: 84%|████████▎ | 72/86 [00:13<00:02, 5.40it/s, scatter z, y]\rSummarize dataset: 85%|████████▍ | 73/86 [00:13<00:02, 5.39it/s, scatter z, y]\rSummarize dataset: 85%|████████▍ | 73/86 [00:13<00:02, 5.39it/s, scatter df_index, z]\rSummarize dataset: 86%|████████▌ | 74/86 [00:13<00:02, 5.36it/s, scatter df_index, z]\rSummarize dataset: 86%|████████▌ | 74/86 [00:13<00:02, 5.36it/s, scatter carat, z] \rSummarize dataset: 87%|████████▋ | 75/86 [00:13<00:02, 5.40it/s, scatter carat, z]\rSummarize dataset: 87%|████████▋ | 75/86 [00:13<00:02, 5.40it/s, scatter depth, z]\rSummarize dataset: 88%|████████▊ | 76/86 [00:14<00:01, 5.35it/s, scatter depth, z]\rSummarize dataset: 88%|████████▊ | 76/86 [00:14<00:01, 5.35it/s, scatter table, z]\rSummarize dataset: 90%|████████▉ | 77/86 [00:14<00:02, 4.42it/s, scatter table, z]\rSummarize dataset: 90%|████████▉ | 77/86 [00:14<00:02, 4.42it/s, scatter price, z]\rSummarize dataset: 91%|█████████ | 78/86 [00:14<00:01, 4.59it/s, scatter price, z]\rSummarize dataset: 91%|█████████ | 78/86 [00:14<00:01, 4.59it/s, scatter x, z] \rSummarize dataset: 92%|█████████▏| 79/86 [00:14<00:01, 4.82it/s, scatter x, z]\rSummarize dataset: 92%|█████████▏| 79/86 [00:14<00:01, 4.82it/s, scatter y, z]\rSummarize dataset: 93%|█████████▎| 80/86 [00:15<00:01, 4.98it/s, scatter y, z]\rSummarize dataset: 93%|█████████▎| 80/86 [00:15<00:01, 4.98it/s, scatter z, z]\rSummarize dataset: 94%|█████████▍| 81/86 [00:15<00:00, 5.01it/s, scatter z, z]\rSummarize dataset: 94%|█████████▍| 81/86 [00:15<00:00, 5.01it/s, Get dataframe statistics]\rSummarize dataset: 93%|█████████▎| 82/88 [00:15<00:01, 5.01it/s, Missing diagram bar] \rSummarize dataset: 94%|█████████▍| 83/88 [00:15<00:00, 5.25it/s, Missing diagram bar]\rSummarize dataset: 94%|█████████▍| 83/88 [00:15<00:00, 5.25it/s, Missing diagram matrix]\rSummarize dataset: 95%|█████████▌| 84/88 [00:15<00:00, 5.76it/s, Missing diagram matrix]\rSummarize dataset: 95%|█████████▌| 84/88 [00:15<00:00, 5.76it/s, Take sample] \rSummarize dataset: 97%|█████████▋| 85/88 [00:15<00:00, 5.76it/s, Detecting duplicates]\rSummarize dataset: 98%|█████████▊| 86/88 [00:15<00:00, 5.76it/s, Get alerts] \rSummarize dataset: 99%|█████████▉| 87/88 [00:15<00:00, 5.76it/s, Get reproduction details]\rSummarize dataset: 100%|██████████| 88/88 [00:15<00:00, 5.76it/s, Completed] \rSummarize dataset: 100%|██████████| 88/88 [00:15<00:00, 5.60it/s, Completed]\n\rGenerate report structure: 0%| | 0/1 [00:00<?, ?it/s]\rGenerate report structure: 100%|██████████| 1/1 [00:05<00:00, 5.49s/it]\rGenerate report structure: 100%|██████████| 1/1 [00:05<00:00, 5.49s/it]\n\rRender HTML: 0%| | 0/1 [00:00<?, ?it/s]\rRender HTML: 100%|██████████| 1/1 [00:02<00:00, 2.41s/it]\rRender HTML: 100%|██████████| 1/1 [00:02<00:00, 2.41s/it]\n\rExport report to file: 0%| | 0/1 [00:00<?, ?it/s]\rExport report to file: 100%|██████████| 1/1 [00:00<00:00, 94.40it/s]\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["Pandas Profiling Report Dataset statistics
Number of variables 11 Number of observations 10000 Missing cells 0 Missing cells (%) 0.0% Duplicate rows 0 Duplicate rows (%) 0.0% Total size in memory 859.5 KiB Average record size in memory 88.0 B
Alerts
carat
is highly correlated with price
and 3 other fields High correlation price
is highly correlated with carat
and 3 other fields High correlation x
is highly correlated with carat
and 3 other fields High correlation y
is highly correlated with carat
and 3 other fields High correlation z
is highly correlated with carat
and 3 other fields High correlation carat
is highly correlated with price
and 3 other fields High correlation price
is highly correlated with carat
and 3 other fields High correlation x
is highly correlated with carat
and 3 other fields High correlation y
is highly correlated with carat
and 3 other fields High correlation z
is highly correlated with carat
and 3 other fields High correlation carat
is highly correlated with price
and 3 other fields High correlation price
is highly correlated with carat
and 3 other fields High correlation x
is highly correlated with carat
and 3 other fields High correlation y
is highly correlated with carat
and 3 other fields High correlation z
is highly correlated with carat
and 3 other fields High correlation df_index
is highly correlated with carat
and 3 other fields High correlation carat
is highly correlated with df_index
and 4 other fields High correlation cut
is highly correlated with depth
and 1 other fields High correlation depth
is highly correlated with cut
High correlation table
is highly correlated with cut
High correlation price
is highly correlated with df_index
and 3 other fields High correlation x
is highly correlated with df_index
and 4 other fields High correlation y
is highly correlated with carat
and 2 other fields High correlation z
is highly correlated with df_index
and 4 other fields High correlation df_index
has unique values Unique
Reproduction
Analysis started 2021-12-09 22:36:56.032662 Analysis finished 2021-12-09 22:37:11.760218 Duration 15.73 seconds Software version pandas-profiling v3.1.0 Download configuration config.json
df_index Real number (ℝ≥0 )
HIGH CORRELATION
UNIQUE
Distinct 10000 Distinct (%) 100.0% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 27088.2074
Minimum 1 Maximum 53922 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 1 5-th percentile 2652.65 Q1 13355.75 median 27192 Q3 40817 95-th percentile 51315.1 Maximum 53922 Range 53921 Interquartile range (IQR) 27461.25
Descriptive statistics
Standard deviation 15650.20796 Coefficient of variation (CV) 0.5777498573 Kurtosis -1.217596069 Mean 27088.2074 Median Absolute Deviation (MAD) 13724 Skewness -0.01240652578 Sum 270882074 Variance 244929009.2 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 49152 1
< 0.1% 26099 1
< 0.1% 45763 1
< 0.1% 51908 1
< 0.1% 17093 1
< 0.1% 30327 1
< 0.1% 21372 1
< 0.1% 15050 1
< 0.1% 51916 1
< 0.1% 37583 1
< 0.1% Other values (9990) 9990 99.9%
Value Count Frequency (%) 1 1 < 0.1%
2 1 < 0.1%
6 1 < 0.1%
10 1 < 0.1%
11 1 < 0.1%
17 1 < 0.1%
18 1 < 0.1%
28 1 < 0.1%
55 1 < 0.1%
59 1 < 0.1%
Value Count Frequency (%) 53922 1 < 0.1%
53920 1 < 0.1%
53918 1 < 0.1%
53914 1 < 0.1%
53912 1 < 0.1%
53907 1 < 0.1%
53906 1 < 0.1%
53905 1 < 0.1%
53901 1 < 0.1%
53891 1 < 0.1%
carat Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 237 Distinct (%) 2.4% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 0.791456
Minimum 0.2 Maximum 4.5 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 0.2 5-th percentile 0.3 Q1 0.4 median 0.7 Q3 1.04 95-th percentile 1.71 Maximum 4.5 Range 4.3 Interquartile range (IQR) 0.64
Descriptive statistics
Standard deviation 0.4728613326 Coefficient of variation (CV) 0.597457512 Kurtosis 1.490228108 Mean 0.791456 Median Absolute Deviation (MAD) 0.32 Skewness 1.162005658 Sum 7914.56 Variance 0.2235978398 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 0.3 481
4.8% 0.31 447
4.5% 1.01 386
3.9% 0.7 368
3.7% 0.32 346
3.5% 1 287
2.9% 0.9 262
2.6% 0.4 259
2.6% 0.71 237
2.4% 0.5 232
2.3% Other values (227) 6695 67.0%
Value Count Frequency (%) 0.2 2
< 0.1% 0.21 4
< 0.1% 0.22 1
< 0.1% 0.23 48 0.5%
0.24 49 0.5%
0.25 41 0.4%
0.26 44 0.4%
0.27 45 0.4%
0.28 29 0.3%
0.29 24 0.2%
Value Count Frequency (%) 4.5 1 < 0.1%
4.01 1 < 0.1%
3.67 1 < 0.1%
3.02 1 < 0.1%
3.01 2 < 0.1%
3 1 < 0.1%
2.75 1 < 0.1%
2.74 1 < 0.1%
2.68 1 < 0.1%
2.65 1 < 0.1%
cut Categorical
HIGH CORRELATION
Distinct 5 Distinct (%) 0.1% Missing 0 Missing (%) 0.0% Memory size 78.2 KiB
Ideal 3931
Premium 2603
Very Good 2308
Good 883
Fair
275
Toggle details
Length
Max length 9 Median length 5 Mean length 6.328 Min length 4
Characters and Unicode
Total characters 0 Distinct characters 0 Distinct categories 0 ? Distinct scripts 0 ? Distinct blocks 0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.
Sample
1st row Ideal 2nd row Very Good 3rd row Ideal 4th row Premium 5th row Good
Common Values Value Count Frequency (%) Ideal 3931 39.3%
Premium 2603 26.0%
Very Good 2308 23.1%
Good 883
8.8% Fair 275
2.8%
Length Histogram of lengths of the category
Value Count Frequency (%) ideal 3931 31.9%
good 3191 25.9%
premium 2603 21.1%
very 2308 18.8%
fair 275
2.2%
Most occurring characters Value Count Frequency (%) No values found.
Most occurring categories Value Count Frequency (%) No values found.
Most frequent character per category Most occurring scripts Value Count Frequency (%) No values found.
Most frequent character per script Most occurring blocks Value Count Frequency (%) No values found.
Most frequent character per block Distinct 7 Distinct (%) 0.1% Missing 0 Missing (%) 0.0% Memory size 78.2 KiB
G 2089
E 1811
F 1803
H 1559
D 1252
Other values (2) 1486
Toggle details
Length
Max length 1 Median length 1 Mean length 1 Min length 1
Characters and Unicode
Total characters 0 Distinct characters 0 Distinct categories 0 ? Distinct scripts 0 ? Distinct blocks 0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.
Sample
1st row D 2nd row E 3rd row G 4th row F 5th row J
Common Values Value Count Frequency (%) G 2089 20.9%
E 1811 18.1%
F 1803 18.0%
H 1559 15.6%
D 1252 12.5%
I 1001 10.0%
J 485
4.9%
Length Histogram of lengths of the category
Value Count Frequency (%) g 2089 20.9%
e 1811 18.1%
f 1803 18.0%
h 1559 15.6%
d 1252 12.5%
i 1001 10.0%
j 485
4.9%
Most occurring characters Value Count Frequency (%) No values found.
Most occurring categories Value Count Frequency (%) No values found.
Most frequent character per category Most occurring scripts Value Count Frequency (%) No values found.
Most frequent character per script Most occurring blocks Value Count Frequency (%) No values found.
Most frequent character per block Distinct 8 Distinct (%) 0.1% Missing 0 Missing (%) 0.0% Memory size 78.2 KiB
SI1 2422
VS2 2284
SI2 1719
VS1 1528
VVS2 905
Other values (3) 1142
Toggle details
Length
Max length 4 Median length 3 Mean length 3.1147 Min length 2
Characters and Unicode
Total characters 0 Distinct characters 0 Distinct categories 0 ? Distinct scripts 0 ? Distinct blocks 0 ?
The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables.
Sample
1st row VVS1 2nd row VVS2 3rd row VVS1 4th row VS1 5th row VS2
Common Values Value Count Frequency (%) SI1 2422 24.2%
VS2 2284 22.8%
SI2 1719 17.2%
VS1 1528 15.3%
VVS2 905
9.0% VVS1 692
6.9% IF 319
3.2% I1 131
1.3%
Length Histogram of lengths of the category
Value Count Frequency (%) si1 2422 24.2%
vs2 2284 22.8%
si2 1719 17.2%
vs1 1528 15.3%
vvs2 905
9.0% vvs1 692
6.9% if 319
3.2% i1 131
1.3%
Most occurring characters Value Count Frequency (%) No values found.
Most occurring categories Value Count Frequency (%) No values found.
Most frequent character per category Most occurring scripts Value Count Frequency (%) No values found.
Most frequent character per script Most occurring blocks Value Count Frequency (%) No values found.
Most frequent character per block depth Real number (ℝ≥0 )
HIGH CORRELATION
Distinct 139 Distinct (%) 1.4% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 61.744
Minimum 44 Maximum 73.6 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 44 5-th percentile 59.3 Q1 61.1 median 61.8 Q3 62.5 95-th percentile 63.705 Maximum 73.6 Range 29.6 Interquartile range (IQR) 1.4
Descriptive statistics
Standard deviation 1.416036199 Coefficient of variation (CV) 0.02293398871 Kurtosis 6.215816952 Mean 61.744 Median Absolute Deviation (MAD) 0.7 Skewness -0.2259760176 Sum 617440 Variance 2.005158516 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 62 409
4.1% 61.9 403
4.0% 62.2 387
3.9% 61.6 384
3.8% 62.1 376
3.8% 61.8 373
3.7% 61.7 362
3.6% 62.3 360
3.6% 62.4 354
3.5% 61.5 301
3.0% Other values (129) 6291 62.9%
Value Count Frequency (%) 44 1
< 0.1% 52.3 1
< 0.1% 53.1 1
< 0.1% 55.2 1
< 0.1% 55.3 2 < 0.1%
55.5 2 < 0.1%
55.6 2 < 0.1%
55.8 2 < 0.1%
55.9 3 < 0.1%
56 3 < 0.1%
Value Count Frequency (%) 73.6 1
< 0.1% 72.9 1
< 0.1% 71.2 1
< 0.1% 70.5 1
< 0.1% 70.2 1
< 0.1% 70.1 1
< 0.1% 69.5 1
< 0.1% 69 1
< 0.1% 68.9 3 < 0.1%
68.7 1
< 0.1%
table Real number (ℝ≥0 )
HIGH CORRELATION
Distinct 90 Distinct (%) 0.9% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 57.44649
Minimum 43 Maximum 71 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 43 5-th percentile 54 Q1 56 median 57 Q3 59 95-th percentile 61 Maximum 71 Range 28 Interquartile range (IQR) 3
Descriptive statistics
Standard deviation 2.223800847 Coefficient of variation (CV) 0.03871082197 Kurtosis 1.249490598 Mean 57.44649 Median Absolute Deviation (MAD) 1 Skewness 0.6529730234 Sum 574464.9 Variance 4.945290209 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 57 1817 18.2%
56 1756 17.6%
58 1556 15.6%
59 1254 12.5%
55 1213 12.1%
60 785 7.8%
54 491
4.9% 61 404
4.0% 62 232
2.3% 63 111
1.1% Other values (80) 381
3.8%
Value Count Frequency (%) 43 1
< 0.1% 49 1
< 0.1% 51 3
< 0.1% 52 19
0.2% 52.8 1
< 0.1% 53 94 0.9%
53.1 1
< 0.1% 53.4 1
< 0.1% 53.5 3
< 0.1% 53.6 3
< 0.1%
Value Count Frequency (%) 71 1
< 0.1% 70 2
< 0.1% 69 3
< 0.1% 68 3
< 0.1% 67 8
0.1% 66 15
0.1% 65 24
0.2% 64 49 0.5%
63 111 1.1%
62.5 1
< 0.1%
price Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 5091 Distinct (%) 50.9% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 3873.9908
Minimum 326 Maximum 18795 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 326 5-th percentile 544 Q1 950 median 2361 Q3 5226 95-th percentile 13016.15 Maximum 18795 Range 18469 Interquartile range (IQR) 4276
Descriptive statistics
Standard deviation 3965.051797 Coefficient of variation (CV) 1.023505734 Kurtosis 2.363425826 Mean 3873.9908 Median Absolute Deviation (MAD) 1631.5 Skewness 1.663784166 Sum 38739908 Variance 15721635.75 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 544 32
0.3% 625 31
0.3% 605 30
0.3% 776 29
0.3% 698 28
0.3% 828 25
0.2% 684 24
0.2% 720 23
0.2% 789 23
0.2% 552 23
0.2% Other values (5081) 9732 97.3%
Value Count Frequency (%) 326 1 < 0.1%
327 1 < 0.1%
336 1 < 0.1%
339 1 < 0.1%
340 1 < 0.1%
351 2 < 0.1%
357 2 < 0.1%
362 1 < 0.1%
363 1 < 0.1%
364 1 < 0.1%
Value Count Frequency (%) 18795 1 < 0.1%
18784 1 < 0.1%
18781 1 < 0.1%
18779 1 < 0.1%
18766 1 < 0.1%
18745 1 < 0.1%
18741 1 < 0.1%
18706 1 < 0.1%
18678 1 < 0.1%
18611 1 < 0.1%
x Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 495 Distinct (%) 5.0% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 5.716036
Minimum 0 Maximum 10.23 Zeros 1 Zeros (%) < 0.1% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 0 5-th percentile 4.29 Q1 4.71 median 5.68 Q3 6.53 95-th percentile 7.65 Maximum 10.23 Range 10.23 Interquartile range (IQR) 1.82
Descriptive statistics
Standard deviation 1.118243951 Coefficient of variation (CV) 0.1956327691 Kurtosis -0.6066053655 Mean 5.716036 Median Absolute Deviation (MAD) 0.92 Skewness 0.4124316947 Sum 57160.36 Variance 1.250469534 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 4.34 92
0.9% 4.35 92
0.9% 4.39 84
0.8% 4.36 81
0.8% 4.33 75
0.8% 4.32 74
0.7% 4.31 73
0.7% 4.3 72
0.7% 4.42 71
0.7% 4.37 69
0.7% Other values (485) 9217 92.2%
Value Count Frequency (%) 0 1
< 0.1% 3.81 2
< 0.1% 3.85 2
< 0.1% 3.86 1
< 0.1% 3.88 1
< 0.1% 3.89 3 < 0.1%
3.9 3 < 0.1%
3.91 5 0.1%
3.92 5 0.1%
3.93 6 0.1%
Value Count Frequency (%) 10.23 1 < 0.1%
10.14 1 < 0.1%
9.86 1 < 0.1%
9.44 2 < 0.1%
9.3 1 < 0.1%
9.11 1 < 0.1%
9.04 1 < 0.1%
8.99 1 < 0.1%
8.9 1 < 0.1%
8.88 1 < 0.1%
y Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 500 Distinct (%) 5.0% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 5.726097
Minimum 3.77 Maximum 58.9 Zeros 0 Zeros (%) 0.0% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 3.77 5-th percentile 4.3 Q1 4.72 median 5.7 Q3 6.53 95-th percentile 7.65 Maximum 58.9 Range 55.13 Interquartile range (IQR) 1.81
Descriptive statistics
Standard deviation 1.256195462 Coefficient of variation (CV) 0.2193807514 Kurtosis 338.2400271 Mean 5.726097 Median Absolute Deviation (MAD) 0.92 Skewness 8.753843386 Sum 57260.97 Variance 1.578027039 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 4.38 99
1.0% 4.39 89
0.9% 4.37 88
0.9% 4.32 88
0.9% 4.34 88
0.9% 4.36 83
0.8% 4.35 80
0.8% 4.33 77
0.8% 4.41 69
0.7% 4.31 65
0.7% Other values (490) 9174 91.7%
Value Count Frequency (%) 3.77 1
< 0.1% 3.78 1
< 0.1% 3.81 1
< 0.1% 3.84 1
< 0.1% 3.85 1
< 0.1% 3.86 1
< 0.1% 3.89 1
< 0.1% 3.9 5 0.1%
3.92 1
< 0.1% 3.93 3 < 0.1%
Value Count Frequency (%) 58.9 1 < 0.1%
31.8 1 < 0.1%
10.16 1 < 0.1%
10.1 1 < 0.1%
9.81 1 < 0.1%
9.38 1 < 0.1%
9.37 1 < 0.1%
9.14 1 < 0.1%
9.02 1 < 0.1%
8.98 1 < 0.1%
z Real number (ℝ≥0 )
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
HIGH CORRELATION
Distinct 328 Distinct (%) 3.3% Missing 0 Missing (%) 0.0% Infinite 0 Infinite (%) 0.0% Mean 3.529078
Minimum 0 Maximum 8.06 Zeros 4 Zeros (%) < 0.1% Negative 0 Negative (%) 0.0% Memory size 78.2 KiB
Toggle details
Quantile statistics
Minimum 0 5-th percentile 2.65 Q1 2.91 median 3.51 Q3 4.03 95-th percentile 4.7405 Maximum 8.06 Range 8.06 Interquartile range (IQR) 1.12
Descriptive statistics
Standard deviation 0.6945528852 Coefficient of variation (CV) 0.1968085957 Kurtosis -0.2749503117 Mean 3.529078 Median Absolute Deviation (MAD) 0.56 Skewness 0.3966858454 Sum 35290.78 Variance 0.4824037103 Monotonicity Not monotonic
Histogram with fixed size bins (bins=50)
Value Count Frequency (%) 2.69 166
1.7% 2.7 151
1.5% 2.71 139
1.4% 2.72 126
1.3% 2.68 125
1.2% 2.67 119
1.2% 2.73 115
1.1% 2.66 103
1.0% 3.55 99
1.0% 4.01 98
1.0% Other values (318) 8759 87.6%
Value Count Frequency (%) 0 4 < 0.1%
2.29 1
< 0.1% 2.3 1
< 0.1% 2.31 2 < 0.1%
2.32 2 < 0.1%
2.33 1
< 0.1% 2.35 1
< 0.1% 2.36 1
< 0.1% 2.37 2 < 0.1%
2.38 2 < 0.1%
Value Count Frequency (%) 8.06 1
< 0.1% 6.72 1
< 0.1% 6.17 1
< 0.1% 6.13 1
< 0.1% 5.91 1
< 0.1% 5.62 1
< 0.1% 5.61 1
< 0.1% 5.6 1
< 0.1% 5.58 3 < 0.1%
5.57 1
< 0.1%
Toggle correlation descriptions
Spearman's ρ The Spearman's rank correlation coefficient (ρ ) is a measure of monotonic correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than Pearson's r . It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation. To calculate ρ for two variables X and Y , one divides the covariance of the rank variables of X and Y by the product of their standard deviations.
Pearson's r The Pearson's correlation coefficient (r ) is a measure of linear correlation between two variables. It's value lies between -1 and +1, -1 indicating total negative linear correlation, 0 indicating no linear correlation and 1 indicating total positive linear correlation. Furthermore, r is invariant under separate changes in location and scale of the two variables, implying that for a linear function the angle to the x-axis does not affect r . To calculate r for two variables X and Y , one divides the covariance of X and Y by the product of their standard deviations.
Kendall's τ Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation coefficient (τ ) measures ordinal association between two variables. It's value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation. To calculate τ for two variables X and Y , one determines the number of concordant and discordant pairs of observations. τ is given by the number of concordant pairs minus the discordant pairs divided by the total number of pairs.Cramér's V (φc) Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association. The empirical estimators used for Cramér's V have been proved to be biased, even for large samples. We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found
here .
Phik (φk) Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case of a bivariate normal input distribution. There is extensive documentation available
here .
A simple visualization of nullity by column.
Nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.
First rows df_index carat cut color clarity depth table price x y z 0 7753 0.64 Ideal D VVS1 61.5 56.0 4281 5.57 5.59 3.43 1 50394 0.52 Very Good E VVS2 61.2 58.0 2254 5.17 5.19 3.17 2 38989 0.36 Ideal G VVS1 62.8 57.0 1053 4.55 4.52 2.85 3 23162 1.32 Premium F VS1 61.7 59.0 11177 6.99 6.95 4.30 4 31445 0.40 Good J VS2 64.0 57.0 765 4.70 4.67 3.00 5 13207 1.21 Very Good I SI1 60.6 56.0 5457 6.89 6.97 4.20 6 35847 0.31 Ideal F VVS2 61.6 55.0 917 4.35 4.38 2.69 7 22987 1.56 Very Good H VS2 63.1 60.0 11039 7.43 7.34 4.66 8 46019 0.53 Good E VS2 63.7 55.0 1727 5.15 5.12 3.27 9 51497 0.70 Fair F VS1 65.8 59.0 2381 5.58 5.48 3.64
Last rows df_index carat cut color clarity depth table price x y z 9990 31357 0.27 Ideal F IF 63.0 55.0 760 4.13 4.16 2.61 9991 48169 0.79 Good F SI2 64.3 60.0 1943 5.82 5.75 3.72 9992 24629 2.01 Good I SI2 59.1 59.0 12964 8.09 8.14 4.80 9993 38145 0.30 Very Good G VVS1 63.5 57.0 1013 4.27 4.24 2.70 9994 12465 1.03 Ideal D SI1 62.3 56.0 5249 6.51 6.45 4.04 9995 15372 0.27 Very Good E VVS2 61.2 60.0 606 4.16 4.17 2.55 9996 18810 1.06 Good F VS1 58.5 57.0 7699 6.68 6.73 3.92 9997 6564 1.00 Good E SI2 61.5 64.0 4077 6.29 6.22 3.85 9998 2864 0.72 Ideal D VS2 60.5 57.0 3275 5.81 5.83 3.52 9999 8688 1.07 Ideal H VS2 62.1 57.0 4458 6.53 6.49 4.04
"]}}],"execution_count":0},{"cell_type":"code","source":["df = spark.table(\"bling\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"deb29a46-ad1b-4c5e-9cb6-7089e52a6d60"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[{"name":"df","typeStr":"pyspark.sql.dataframe.DataFrame","schema":{"fields":[{"metadata":{},"name":"_c0","nullable":true,"type":"integer"},{"metadata":{},"name":"carat","nullable":true,"type":"double"},{"metadata":{},"name":"cut","nullable":true,"type":"string"},{"metadata":{},"name":"color","nullable":true,"type":"string"},{"metadata":{},"name":"clarity","nullable":true,"type":"string"},{"metadata":{},"name":"depth","nullable":true,"type":"double"},{"metadata":{},"name":"table","nullable":true,"type":"double"},{"metadata":{},"name":"price","nullable":true,"type":"integer"},{"metadata":{},"name":"x","nullable":true,"type":"double"},{"metadata":{},"name":"y","nullable":true,"type":"double"},{"metadata":{},"name":"z","nullable":true,"type":"double"}],"type":"struct"},"tableIdentifier":"dbfs:/delta/diamonds"}],"data":"
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":0},{"cell_type":"code","source":["#first lets build the method\nimport pandas as pd\nfrom pyspark.sql import functions as F\nfrom pyspark.sql.functions import isnan, when, count, col\n\ndef dataprofile(data_all_df,data_cols):\n data_df = data_all_df.select(data_cols)\n columns2Bprofiled = data_df.columns\n global schema_name, table_name\n if not 'schema_name' in globals():\n schema_name = 'schema_name'\n if not 'table_name' in globals():\n table_name = 'table_name' \n dprof_df = pd.DataFrame({'schema_name':[schema_name] * len(data_df.columns),\\\n 'table_name':[table_name] * len(data_df.columns),\\\n 'column_names':data_df.columns,\\\n 'data_types':[x[1] for x in data_df.dtypes]}) \n dprof_df = dprof_df[['schema_name','table_name','column_names', 'data_types']]\n #dprof_df.set_index('column_names', inplace=True, drop=False)\n # ======================\n num_rows = data_df.count()\n dprof_df['num_rows'] = num_rows\n # ====================== \n # number of rows with nulls and nans \n df_nacounts = data_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data_df.columns \\\n if data_df.select(c).dtypes[0][1]!='timestamp']).toPandas().transpose()\n df_nacounts = df_nacounts.reset_index() \n df_nacounts.columns = ['column_names','num_null']\n dprof_df = pd.merge(dprof_df, df_nacounts, on = ['column_names'], how = 'left')\n # ========================\n # number of rows with white spaces (one or more space) or blanks\n num_spaces = [data_df.where(F.col(c).rlike('^\\\\s+$')).count() for c in data_df.columns]\n dprof_df['num_spaces'] = num_spaces\n num_blank = [data_df.where(F.col(c)=='').count() for c in data_df.columns]\n dprof_df['num_blank'] = num_blank\n # =========================\n # using the in built describe() function \n desc_df = data_df.describe().toPandas().transpose()\n desc_df.columns = ['count', 'mean', 'stddev', 'min', 'max']\n desc_df = desc_df.iloc[1:,:] \n desc_df = desc_df.reset_index() \n desc_df.columns.values[0] = 'column_names' \n desc_df = desc_df[['column_names','count', 'mean', 'stddev']] \n dprof_df = pd.merge(dprof_df, desc_df , on = ['column_names'], how = 'left')\n # ===========================================\n try:\n allminvalues = [data_df.select(F.min(x)).limit(1).toPandas().iloc[0][0] for x in columns2Bprofiled]\n allmaxvalues = [data_df.select(F.max(x)).limit(1).toPandas().iloc[0][0] for x in columns2Bprofiled]\n except:\n allminvalues = None\n allmaxvalues = None\n try:\n allmincounts = None\n allmaxcounts = None\n except:\n print('')\n \n df_counts = dprof_df[['column_names']]\n df_counts.insert(loc=0, column='min', value=allminvalues)\n df_counts.insert(loc=0, column='counts_min', value=allmincounts)\n df_counts.insert(loc=0, column='max', value=allmaxvalues)\n df_counts.insert(loc=0, column='counts_max', value=allmaxcounts)\n df_counts = df_counts[['column_names','min','counts_min','max','counts_max']]\n dprof_df = pd.merge(dprof_df, df_counts , on = ['column_names'], how = 'left') \n # ==========================================\n # number of distinct values in each column\n dprof_df['num_distinct'] = [data_df.select(x).distinct().count() for x in columns2Bprofiled]\n # ============================================\n # most frequently occuring value in a column and its count\n dprof_df['most_freq_valwcount'] = [data_df.groupBy(x).count().sort(\"count\",ascending=False).limit(1).\\\n toPandas().iloc[0].values.tolist() for x in columns2Bprofiled]\n dprof_df['most_freq_value'] = [x[0] for x in dprof_df['most_freq_valwcount']]\n dprof_df['most_freq_value_count'] = [x[1] for x in dprof_df['most_freq_valwcount']]\n dprof_df = dprof_df.drop(['most_freq_valwcount'],axis=1)\n # least frequently occuring value in a column and its count\n dprof_df['least_freq_valwcount'] = [data_df.groupBy(x).count().sort(\"count\",ascending=True).limit(1).\\\n toPandas().iloc[0].values.tolist() for x in columns2Bprofiled]\n dprof_df['least_freq_value'] = [x[0] for x in dprof_df['least_freq_valwcount']]\n dprof_df['least_freq_value_count'] = [x[1] for x in dprof_df['least_freq_valwcount']]\n dprof_df = dprof_df.drop(['least_freq_valwcount'],axis=1)\n\n return dprof_df\n \nprint('done')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"193ff84b-1826-49b3-8a34-2e2a5baef5f2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"done\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\ndone\n
"]}}],"execution_count":0},{"cell_type":"code","source":["dc = df.columns\ndp = dataprofile(df,dc)\ndp"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e16af345-88a4-4683-84d2-906f5af6d200"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"\n\n
\n \n \n \n schema_name \n table_name \n column_names \n data_types \n num_rows \n num_null \n num_spaces \n num_blank \n count \n mean \n stddev \n min \n counts_min \n max \n counts_max \n num_distinct \n most_freq_value \n most_freq_value_count \n least_freq_value \n least_freq_value_count \n \n \n \n \n 0 \n schema_name \n table_name \n _c0 \n int \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 26970.5 \n 15571.281096942537 \n 1 \n None \n 53940 \n None \n 53940 \n 148 \n 1.0 \n 148 \n 1.0 \n \n \n 1 \n schema_name \n table_name \n carat \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 0.7979397478679852 \n 0.4740112444054196 \n 0.2 \n None \n 5.01 \n None \n 273 \n 0.3 \n 2604.0 \n 3.02 \n 1.0 \n \n \n 2 \n schema_name \n table_name \n cut \n string \n 53940 \n 0 \n 0 \n 0 \n 53940 \n None \n None \n Fair \n None \n Very Good \n None \n 5 \n Ideal \n 21551.0 \n Fair \n 1610.0 \n \n \n 3 \n schema_name \n table_name \n color \n string \n 53940 \n 0 \n 0 \n 0 \n 53940 \n None \n None \n D \n None \n J \n None \n 7 \n G \n 11292.0 \n J \n 2808.0 \n \n \n 4 \n schema_name \n table_name \n clarity \n string \n 53940 \n 0 \n 0 \n 0 \n 53940 \n None \n None \n I1 \n None \n VVS2 \n None \n 8 \n SI1 \n 13065.0 \n I1 \n 741.0 \n \n \n 5 \n schema_name \n table_name \n depth \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 61.74940489432624 \n 1.4326213188336525 \n 43.0 \n None \n 79.0 \n None \n 184 \n 62.0 \n 2239.0 \n 53.3 \n 1.0 \n \n \n 6 \n schema_name \n table_name \n table \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 57.45718390804603 \n 2.2344905628213247 \n 43.0 \n None \n 95.0 \n None \n 127 \n 56.0 \n 9881.0 \n 64.2 \n 1.0 \n \n \n 7 \n schema_name \n table_name \n price \n int \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 3932.799721913237 \n 3989.439738146397 \n 326 \n None \n 18823 \n None \n 11602 \n 605 \n 132.0 \n 4519 \n 1.0 \n \n \n 8 \n schema_name \n table_name \n x \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 5.731157211716609 \n 1.1217607467924915 \n 0.0 \n None \n 10.74 \n None \n 554 \n 4.37 \n 448.0 \n 9.13 \n 1.0 \n \n \n 9 \n schema_name \n table_name \n y \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 5.734525954764462 \n 1.1421346741235616 \n 0.0 \n None \n 58.9 \n None \n 552 \n 4.34 \n 437.0 \n 58.9 \n 1.0 \n \n \n 10 \n schema_name \n table_name \n z \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 3.5387337782723316 \n 0.7056988469499883 \n 0.0 \n None \n 31.8 \n None \n 375 \n 2.7 \n 767.0 \n 5.86 \n 1.0 \n \n \n
\n
","textData":"Out[5]:
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"htmlSandbox","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n\n
\n \n \n \n schema_name \n table_name \n column_names \n data_types \n num_rows \n num_null \n num_spaces \n num_blank \n count \n mean \n stddev \n min \n counts_min \n max \n counts_max \n num_distinct \n most_freq_value \n most_freq_value_count \n least_freq_value \n least_freq_value_count \n \n \n \n \n 0 \n schema_name \n table_name \n _c0 \n int \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 26970.5 \n 15571.281096942537 \n 1 \n None \n 53940 \n None \n 53940 \n 148 \n 1.0 \n 148 \n 1.0 \n \n \n 1 \n schema_name \n table_name \n carat \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 0.7979397478679852 \n 0.4740112444054196 \n 0.2 \n None \n 5.01 \n None \n 273 \n 0.3 \n 2604.0 \n 3.02 \n 1.0 \n \n \n 2 \n schema_name \n table_name \n cut \n string \n 53940 \n 0 \n 0 \n 0 \n 53940 \n None \n None \n Fair \n None \n Very Good \n None \n 5 \n Ideal \n 21551.0 \n Fair \n 1610.0 \n \n \n 3 \n schema_name \n table_name \n color \n string \n 53940 \n 0 \n 0 \n 0 \n 53940 \n None \n None \n D \n None \n J \n None \n 7 \n G \n 11292.0 \n J \n 2808.0 \n \n \n 4 \n schema_name \n table_name \n clarity \n string \n 53940 \n 0 \n 0 \n 0 \n 53940 \n None \n None \n I1 \n None \n VVS2 \n None \n 8 \n SI1 \n 13065.0 \n I1 \n 741.0 \n \n \n 5 \n schema_name \n table_name \n depth \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 61.74940489432624 \n 1.4326213188336525 \n 43.0 \n None \n 79.0 \n None \n 184 \n 62.0 \n 2239.0 \n 53.3 \n 1.0 \n \n \n 6 \n schema_name \n table_name \n table \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 57.45718390804603 \n 2.2344905628213247 \n 43.0 \n None \n 95.0 \n None \n 127 \n 56.0 \n 9881.0 \n 64.2 \n 1.0 \n \n \n 7 \n schema_name \n table_name \n price \n int \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 3932.799721913237 \n 3989.439738146397 \n 326 \n None \n 18823 \n None \n 11602 \n 605 \n 132.0 \n 4519 \n 1.0 \n \n \n 8 \n schema_name \n table_name \n x \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 5.731157211716609 \n 1.1217607467924915 \n 0.0 \n None \n 10.74 \n None \n 554 \n 4.37 \n 448.0 \n 9.13 \n 1.0 \n \n \n 9 \n schema_name \n table_name \n y \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 5.734525954764462 \n 1.1421346741235616 \n 0.0 \n None \n 58.9 \n None \n 552 \n 4.34 \n 437.0 \n 58.9 \n 1.0 \n \n \n 10 \n schema_name \n table_name \n z \n double \n 53940 \n 0 \n 0 \n 0 \n 53940 \n 3.5387337782723316 \n 0.7056988469499883 \n 0.0 \n None \n 31.8 \n None \n 375 \n 2.7 \n 767.0 \n 5.86 \n 1.0 \n \n \n
\n
"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b6a4e85d-8213-4fcc-99d4-3c6a7a82c7b0"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"Pandas_Profiling_Test","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":7032705}},"nbformat":4,"nbformat_minor":0}