import os
import logging
from google.cloud import storage
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] \
= "data/key/healthdatanexus-review-d7f7ef30e829.json"
import pandas as pd
import numpy as np
import plotly.express as px
import warnings
# Load the pre-processed demographics tables (one CSV per data split) from
# the GCS bucket, in test / train / validation order.
test_demo_dat, train_demo_dat, valid_demo_dat = (
    pd.read_csv(csv_uri)
    for csv_uri in (
        'gs://gem-data/GIM/pre-processed/test/test_demographics.csv',
        'gs://gem-data/GIM/pre-processed/train/train_demographics.csv',
        'gs://gem-data/GIM/pre-processed/valid/valid_demographics.csv',
    )
)

# Load the matching encounters tables for the same three splits.
test_encount_dat, train_encount_dat, valid_encount_dat = (
    pd.read_csv(csv_uri)
    for csv_uri in (
        'gs://gem-data/GIM/pre-processed/test/test_encounters.csv',
        'gs://gem-data/GIM/pre-processed/train/train_encounters.csv',
        'gs://gem-data/GIM/pre-processed/valid/valid_encounters.csv',
    )
)
We begin by loading the data. Once we have the test, training, and validation splits for both the demographic and the encounter data, we can join the two datasets to obtain demographics at the patient level rather than the encounter level.
## Get encounter data, dropping any duplicates
# NOTE: this silences every warning for the rest of the session, not only
# warnings raised by the statements below.
warnings.filterwarnings("ignore")

# Stack the test / train / validation splits into one table per data source.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
encount_dat = pd.concat(
    [test_encount_dat, train_encount_dat, valid_encount_dat])
demo_dat = pd.concat([test_demo_dat, train_demo_dat, valid_demo_dat])

# Number of distinct encounter identifiers in the demographics table.
# (This was previously computed from the encounter table by mistake.)
num_dem = len(demo_dat[['ENCOUNTER_NUM']].drop_duplicates())

# Encounter -> patient lookup used to attach a patient key to each
# demographics row.
encount_dat_pt = encount_dat[['ENCOUNTER_NUM', 'PATIENT_DK']]
# Row count of the lookup; duplicates are NOT removed here, which matches the
# value this variable ended up holding before.
num_enc = len(encount_dat_pt['ENCOUNTER_NUM'])

# Join with patient data
# merge returns a new frame, so demo_dat itself is left untouched.
patient_dat = demo_dat.merge(
    encount_dat_pt, on='ENCOUNTER_NUM', how='left')

# Reduce to one row per distinct (patient, sex) pair.
patient_sex = patient_dat[['PATIENT_DK', 'sex']]
patient_sex_unique = patient_sex.drop_duplicates()
Once the data is prepped, we can group on the sex field to get patient counts by sex.
# Reduce the joined table to one row per distinct (patient, sex) pair so that
# a patient with several encounters is only counted once.
patient_sex = patient_dat[['PATIENT_DK', 'sex']]
patient_sex_unique = patient_sex.drop_duplicates()

# Count the PATIENT_DK entries within each sex group.
sex = patient_sex_unique.groupby('sex', as_index=False).count()

# NOTE(review): the relabelling below assumes the grouping yields exactly two
# groups and that the first one corresponds to 'M' and the second to 'F'.
# It raises if there are more or fewer groups. Confirm against the raw coding
# of the `sex` column.
sex['sex'] = ['M', 'F']
sex.columns = ['Sex', 'Count']
sex
| | Sex | Count |
|---|---|---|
| 0 | M | 7545 |
| 1 | F | 5969 |
Finally, we use the plotly package to plot the results.
# Bar chart of patient counts by sex, drawn from the `sex` summary frame
# (columns 'Sex' and 'Count').
fig_sex = px.bar(
    sex,
    x="Sex",
    y="Count",
    color="Sex",
    labels={"Sex": "Sex", "Count": "Number of Patients"},
)
# The legend is hidden because the bar colours repeat the x-axis categories.
fig_sex.update_layout(paper_bgcolor="#f9f9f9", showlegend=False)
fig_sex