import pandas as pd
import numpy as np
# Load the raw document-report export. read_csv already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
df = pd.read_csv('doc_reports-1.csv')
df
| id | user_id | result | visual_authenticity_result | image_integrity_result | face_detection_result | image_quality_result | created_at | supported_document_result | conclusive_document_quality_result | colour_picture_result | data_validation_result | data_consistency_result | data_comparison_result | attempt_id | police_record_result | compromised_document_result | properties | sub_result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | ab23fae164e34af0a1ad1423ce9fd9f0 | consider | consider | clear | clear | clear | 2017-06-20T23:12:57Z | clear | NaN | NaN | clear | clear | NaN | 050a0596de424fab83c433eaa18b3f8d | clear | NaN | {'gender': 'Male', 'nationality': 'IRL', 'docu... | caution |
| 1 | 1.0 | 15a84e8951254011b47412fa4e8f65b8 | clear | clear | clear | clear | clear | 2017-06-20T23:16:04Z | clear | NaN | NaN | clear | NaN | NaN | f69c1e5f45a64e50a26740b9bfb978b7 | clear | NaN | {'gender': 'Female', 'document_type': 'driving... | clear |
| 2 | 2.0 | ffb82fda52b041e4b9af9cb4ef298c85 | clear | clear | clear | clear | clear | 2017-06-20T17:59:49Z | clear | NaN | NaN | clear | clear | NaN | f9f84f3055714d8e8f7419dc984d1769 | clear | NaN | {'gender': 'Male', 'nationality': 'ITA', 'docu... | clear |
| 3 | 3.0 | bd4a8b3e3601427e88aa1d9eab9f4290 | clear | clear | clear | clear | clear | 2017-06-20T17:59:38Z | clear | NaN | NaN | clear | clear | NaN | 10a54a1ecf794404be959e030f11fef6 | clear | NaN | {'gender': 'Male', 'issuing_date': '2007-08', ... | clear |
| 4 | 4.0 | f52ad1c7e69543a9940c3e7f8ed28a39 | clear | clear | clear | clear | clear | 2017-06-20T18:08:09Z | clear | NaN | NaN | clear | clear | NaN | 1f320d1d07de493292b7e0d5ebfb1cb9 | clear | NaN | {'gender': 'Male', 'nationality': 'POL', 'docu... | clear |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16378 | 17511.0 | 7049eba7d145462282bd9f19a834e43a | clear | clear | clear | clear | clear | 2017-10-11T21:54:34Z | clear | clear | clear | clear | clear | NaN | 4cd2b6b70f354904852804f69d4e40c3 | clear | clear | {'gender': 'Male', 'nationality': 'GBR', 'docu... | clear |
| 16379 | 17512.0 | 41c6441d28c646168e948c3b3764ddbf | clear | clear | clear | clear | clear | 2017-10-11T21:55:26Z | clear | clear | clear | clear | NaN | NaN | 709462b908fe4017a67ac049036a7553 | clear | clear | {'issuing_date': '2017-06-17', 'document_type'... | clear |
| 16380 | 17513.0 | 77ecd7d832314de2b9eafd607ec80e0d | clear | clear | clear | clear | clear | 2017-10-11T21:55:12Z | clear | clear | clear | clear | clear | NaN | 999dcabb1cdb4deabcce50518555d72d | clear | clear | {'gender': 'Female', 'nationality': 'GBR', 'do... | clear |
| 16381 | 17514.0 | d4af182a07e7428f973a722147baa0be | clear | clear | clear | clear | clear | 2017-10-11T21:56:45Z | clear | clear | clear | clear | clear | NaN | 956a86269cb548a7a681f1a2ab360ae9 | clear | clear | {'gender': 'Male', 'nationality': 'GBR', 'docu... | clear |
| 16382 | 17515.0 | 4e11eeeaeac5415ebf73db9f64fcc05c | clear | clear | clear | clear | clear | 2017-10-11T21:56:19Z | clear | clear | clear | clear | clear | NaN | cd5b5d5cf3ab4701a947e43e534f86f3 | clear | clear | {'gender': 'Male', 'nationality': 'PRT', 'docu... | clear |
16383 rows × 19 columns
# Expand the stringified dicts stored in ``df['properties']`` into one column
# per attribute of interest, producing ``new_df`` (keyed by ``id``), and drop
# the rows of ``df`` whose properties cell carries no attributes.
import ast

prop_list = list(df['properties'])
cols = ['gender', 'nationality', 'document_type', 'date_of_expiry', 'issuing_country']
extracted = {col: [] for col in cols}
del_rows = []  # positions (not labels) of rows with empty/missing properties

for pos, raw in enumerate(prop_list):
    # A non-string cell (e.g. NaN from a blank CSV field) or a bare '{}'
    # has nothing to extract, so the row is scheduled for removal.
    if not isinstance(raw, str) or len(raw) <= 2:
        del_rows.append(pos)
        continue
    # SECURITY: this text comes straight from the CSV file. The previous
    # eval() would execute arbitrary expressions embedded in it;
    # literal_eval only accepts Python literals (dicts, strings, numbers,
    # None, ...) and raises on anything else.
    props = ast.literal_eval(raw)
    for col in cols:
        # Absent keys become NaN; keys present with a None value stay None.
        extracted[col].append(props.get(col, np.nan))

# Build the frame directly from the per-column lists. Going through
# np.transpose on a list of lists can coerce everything to a string array,
# which would turn NaN into the literal text 'nan' and hide missing values.
new_df = pd.DataFrame(extracted, columns=cols, dtype=object)

# Translate positions to index labels so the drop is correct even if ``df``
# does not carry a default 0..n-1 index.
df.drop(index=df.index[del_rows], inplace=True)

# The remaining rows of ``df`` are in the same order as the rows collected
# above, so the ids can be attached positionally as the first column.
new_df.insert(0, 'id', df['id'].values)
# Move the ``result`` column to the last position, keeping every other
# column in its current relative order.
cols = [name for name in df.columns if name != 'result']
cols.append('result')
df = df.reindex(columns=cols)
# Join the extracted attribute columns onto the report rows via ``id``,
# then remove the columns that are not carried forward.
c_df = new_df.merge(df, on='id')
unused_columns = ['user_id', 'created_at', 'attempt_id', 'properties', 'date_of_expiry']
c_df.drop(columns=unused_columns, inplace=True)
# Count of missing values per remaining column.
c_df.isna().sum()
id 0 gender 3061 nationality 6904 document_type 1 issuing_country 0 visual_authenticity_result 2 image_integrity_result 0 face_detection_result 3 image_quality_result 0 supported_document_result 0 conclusive_document_quality_result 885 colour_picture_result 882 data_validation_result 698 data_consistency_result 5246 data_comparison_result 14389 police_record_result 517 compromised_document_result 1672 sub_result 0 result 0 dtype: int64
# Remove three columns entirely, then discard every row that still has a
# missing value in any of the remaining columns.
sparse_columns = ['data_comparison_result', 'data_consistency_result', 'nationality']
c_df.drop(columns=sparse_columns, inplace=True)
c_df.dropna(inplace=True)
# Re-check: count of missing values per column after the cleanup.
c_df.isna().sum()
id 0 gender 0 document_type 0 issuing_country 0 visual_authenticity_result 0 image_integrity_result 0 face_detection_result 0 image_quality_result 0 supported_document_result 0 conclusive_document_quality_result 0 colour_picture_result 0 data_validation_result 0 police_record_result 0 compromised_document_result 0 sub_result 0 result 0 dtype: int64
# Renumber the rows 0..n-1 after the row drops; the old index values are
# discarded rather than kept as a column.
c_df.reset_index(drop=True, inplace=True)
c_df
| id | gender | document_type | issuing_country | visual_authenticity_result | image_integrity_result | face_detection_result | image_quality_result | supported_document_result | conclusive_document_quality_result | colour_picture_result | data_validation_result | police_record_result | compromised_document_result | sub_result | result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 718.0 | Male | passport | GBR | clear | consider | clear | clear | clear | consider | clear | consider | clear | clear | caution | consider |
| 1 | 720.0 | Female | driving_licence | GBR | clear | consider | clear | clear | clear | consider | clear | clear | clear | clear | caution | consider |
| 2 | 723.0 | Male | driving_licence | GBR | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear |
| 3 | 725.0 | Female | passport | NZL | clear | consider | clear | clear | clear | consider | clear | clear | clear | clear | caution | consider |
| 4 | 729.0 | Male | national_identity_card | LTU | clear | consider | clear | clear | clear | consider | clear | clear | clear | clear | caution | consider |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9820 | 17510.0 | Male | passport | GBR | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear |
| 9821 | 17511.0 | Male | passport | GBR | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear |
| 9822 | 17513.0 | Female | passport | GBR | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear |
| 9823 | 17514.0 | Male | passport | GBR | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear |
| 9824 | 17515.0 | Male | national_identity_card | PRT | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear | clear |
9825 rows × 16 columns
from sklearn import preprocessing
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Input In [15], in <module> ----> 1 from sklearn import preprocessing ModuleNotFoundError: No module named 'sklearn'
# Replace the values of every column except the first (``id``) with the
# integer codes produced by an ordinal encoder fitted on those columns.
encoder = preprocessing.OrdinalEncoder(dtype=int)
feature_block = c_df.iloc[:, 1:]
c_df.iloc[:, 1:] = encoder.fit_transform(feature_block)
c_df
| id | gender | document_type | issuing_country | visual_authenticity_result | image_integrity_result | face_detection_result | image_quality_result | supported_document_result | conclusive_document_quality_result | colour_picture_result | data_validation_result | police_record_result | compromised_document_result | sub_result | result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 718.0 | 1 | 2 | 35 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
| 1 | 720.0 | 0 | 0 | 35 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 723.0 | 1 | 0 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 725.0 | 0 | 2 | 69 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 729.0 | 1 | 1 | 57 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9820 | 17510.0 | 1 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 9821 | 17511.0 | 1 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 9822 | 17513.0 | 0 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 9823 | 17514.0 | 1 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 9824 | 17515.0 | 1 | 1 | 74 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
9825 rows × 16 columns
# Swap the codes 0 and 1 in ``sub_result``; any other codes are left as-is.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected Series: that chained in-place
# form is deprecated and, under pandas Copy-on-Write, modifies only a
# temporary object and leaves ``c_df`` unchanged.
c_df['sub_result'] = c_df['sub_result'].replace({0: 1, 1: 0})
c_df
| id | gender | document_type | issuing_country | visual_authenticity_result | image_integrity_result | face_detection_result | image_quality_result | supported_document_result | conclusive_document_quality_result | colour_picture_result | data_validation_result | police_record_result | compromised_document_result | sub_result | result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 718.0 | 1 | 2 | 35 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 |
| 1 | 720.0 | 0 | 0 | 35 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2 | 723.0 | 1 | 0 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 725.0 | 0 | 2 | 69 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
| 4 | 729.0 | 1 | 1 | 57 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9820 | 17510.0 | 1 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9821 | 17511.0 | 1 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9822 | 17513.0 | 0 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9823 | 17514.0 | 1 | 2 | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9824 | 17515.0 | 1 | 1 | 74 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9825 rows × 16 columns
# Pairwise Pearson correlation between all columns (Pearson is the default
# method of DataFrame.corr), rendered as a heat-map-coloured table with one
# colour scale shared across the whole matrix (axis=None).
cor = c_df.corr(method='pearson')
cor.style.background_gradient(axis=None, cmap='coolwarm')
| id | gender | document_type | issuing_country | visual_authenticity_result | image_integrity_result | face_detection_result | image_quality_result | supported_document_result | conclusive_document_quality_result | colour_picture_result | data_validation_result | police_record_result | compromised_document_result | sub_result | result | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 1.000000 | 0.030311 | -0.064577 | -0.076511 | -0.018751 | 0.154250 | 0.036549 | nan | nan | 0.156256 | -0.023275 | -0.072004 | nan | 0.021647 | 0.101027 | 0.127531 |
| gender | 0.030311 | 1.000000 | -0.023036 | -0.015933 | 0.038617 | -0.020222 | 0.011487 | nan | nan | -0.020701 | 0.010530 | 0.010910 | nan | 0.013055 | 0.003406 | -0.006583 |
| document_type | -0.064577 | -0.023036 | 1.000000 | 0.088797 | 0.040532 | 0.061854 | -0.012433 | nan | nan | 0.061130 | 0.017443 | -0.024664 | nan | -0.029587 | 0.065538 | 0.061017 |
| issuing_country | -0.076511 | -0.015933 | 0.088797 | 1.000000 | -0.004406 | -0.018619 | 0.010115 | nan | nan | -0.018232 | -0.002777 | -0.036764 | nan | -0.009010 | -0.032152 | -0.027826 |
| visual_authenticity_result | -0.018751 | 0.038617 | 0.040532 | -0.004406 | 1.000000 | -0.005935 | 0.419638 | nan | nan | -0.020149 | 0.227302 | 0.011478 | nan | -0.002454 | 0.455263 | 0.252002 |
| image_integrity_result | 0.154250 | -0.020222 | 0.061854 | -0.018619 | -0.005935 | 1.000000 | -0.013605 | nan | nan | 0.997563 | 0.061537 | 0.008584 | nan | -0.009065 | 0.827921 | 0.930830 |
| face_detection_result | 0.036549 | 0.011487 | -0.012433 | 0.010115 | 0.419638 | -0.013605 | 1.000000 | nan | nan | -0.013506 | -0.001881 | -0.006525 | nan | -0.001030 | 0.092099 | 0.105749 |
| image_quality_result | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| supported_document_result | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| conclusive_document_quality_result | 0.156256 | -0.020701 | 0.061130 | -0.018232 | -0.020149 | 0.997563 | -0.013506 | nan | nan | 1.000000 | -0.000876 | 0.006537 | nan | -0.009043 | 0.822829 | 0.928561 |
| colour_picture_result | -0.023275 | 0.010530 | 0.017443 | -0.002777 | 0.227302 | 0.061537 | -0.001881 | nan | nan | -0.000876 | 1.000000 | 0.025646 | nan | -0.000558 | 0.090083 | 0.057281 |
| data_validation_result | -0.072004 | 0.010910 | -0.024664 | -0.036764 | 0.011478 | 0.008584 | -0.006525 | nan | nan | 0.006537 | 0.025646 | 1.000000 | nan | -0.001935 | 0.227737 | 0.198704 |
| police_record_result | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| compromised_document_result | 0.021647 | 0.013055 | -0.029587 | -0.009010 | -0.002454 | -0.009065 | -0.001030 | nan | nan | -0.009043 | -0.000558 | -0.001935 | nan | 1.000000 | 0.063996 | 0.031363 |
| sub_result | 0.101027 | 0.003406 | 0.065538 | -0.032152 | 0.455263 | 0.827921 | 0.092099 | nan | nan | 0.822829 | 0.090083 | 0.227737 | nan | 0.063996 | 1.000000 | 0.961849 |
| result | 0.127531 | -0.006583 | 0.061017 | -0.027826 | 0.252002 | 0.930830 | 0.105749 | nan | nan | 0.928561 | 0.057281 | 0.198704 | nan | 0.031363 | 0.961849 | 1.000000 |
# Keep only the three selected check columns plus the two outcome columns.
final_columns = [
    'image_integrity_result',
    'conclusive_document_quality_result',
    'visual_authenticity_result',
    'sub_result',
    'result',
]
final_data = c_df[final_columns]
final_data
| image_integrity_result | conclusive_document_quality_result | visual_authenticity_result | sub_result | result | |
|---|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 1 | 1 |
| 1 | 1 | 1 | 0 | 1 | 1 |
| 2 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 1 | 0 | 1 | 1 |
| 4 | 1 | 1 | 0 | 1 | 1 |
| ... | ... | ... | ... | ... | ... |
| 9820 | 0 | 0 | 0 | 0 | 0 |
| 9821 | 0 | 0 | 0 | 0 | 0 |
| 9822 | 0 | 0 | 0 | 0 | 0 |
| 9823 | 0 | 0 | 0 | 0 | 0 |
| 9824 | 0 | 0 | 0 | 0 | 0 |
9825 rows × 5 columns
1) Image Integrity Result
2) Conclusive Document Quality Result
3) Visual Authenticity Result
1) Gender
2) Country
3) Document Type
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of how often each encoded value occurs in each selected column.
sns.set(rc={'figure.figsize': (15, 8)})
# Long format: one row per (column name -> "variable", cell -> "value").
long_form = final_data.melt()
sns.countplot(data=long_form, x="variable", hue="value")
# NOTE(review): the legend text is hard-coded and applied in order to the
# hue levels; confirm it matches the encoded values actually present.
plt.legend(labels=["c l e a r", "c o n s i d e r", "s u s p e c t e d"])
plt.show()