# Cohort_selection.py
import pandas as pd
import numpy as np
import os
from .utils import get_demography, print_demo
# Folder that holds this script.
script_dir = os.path.dirname(__file__)
# Project root: climb two directory levels above script_dir and turn the
# result into an absolute, normalised path.
root_dir = os.path.abspath(
    os.path.join(script_dir, *([os.pardir] * 2))
)
# MIMIC
# Load the extracted MIMIC table from <root_dir>/data/MIMIC_data.csv.
df0 = pd.read_csv(os.path.join(root_dir, 'data', 'MIMIC_data.csv'))
print(len(df0), "Initial rows in extracted MIMIC\n")
# Demographic summary of the unfiltered extract, built by the project helpers
# imported from .utils.
demo0 = print_demo(get_demography(df0))
# df0 has not been filtered on sepsis3 yet, so it is reported as the initial
# set of stays rather than as "sepsis stays".
print(f"{len(df0)} initial stays \n({demo0})\n")
# Keep only the stays flagged with sepsis3 == 1.  The explicit .copy() makes
# df1 an independent frame: the new columns added below would otherwise be
# written to a slice of df0 and trigger pandas' SettingWithCopyWarning.
df1 = df0[df0.sepsis3 == 1].copy()
print(f"Removed {len(df0) - len(df1)} stays without sepsis")
demo1 = print_demo(get_demography(df1))
print(f"{len(df1)} sepsis stays \n({demo1})\n")
# Early-treatment indicators: 1 when the treatment was given at all
# (<treatment>_overall == 1) AND its initiation offset is within the cut-off,
# otherwise 0.  A missing offset compares False and therefore yields 0.
# NOTE(review): the column names suggest 24 h / 72 h windows, so the offsets
# are presumably expressed in days (<= 1 and <= 3) — TODO confirm units.
df1['mv_24hr'] = np.where((df1['mech_vent_overall'] == 1) & (df1['MV_init_offset_abs'] <= 1), 1, 0)
df1['vp_24hr'] = np.where((df1['vasopressor_overall'] == 1) & (df1['VP_init_offset_abs'] <= 1), 1, 0)
df1['rrt_72hr'] = np.where((df1['rrt_overall'] == 1) & (df1['RRT_init_offset_abs'] <= 3), 1, 0)
# Exclusion step: retain stays whose los_icu is at least 1; rows with a
# missing los_icu fail the comparison and are dropped as well.
# NOTE(review): the messages speak of 24 hours, so los_icu is presumably
# measured in days — TODO confirm.
df2 = df1.loc[df1['los_icu'].ge(1)]
print(f"Removed {df1.shape[0] - df2.shape[0]} stays with less than 24 hours")
demo2 = print_demo(get_demography(df2))
print(f"{df2.shape[0]} stays with sepsis and LoS > 24h \n({demo2})\n")
# Exclusion step: retain stays with admission_age of 18 or more; rows with a
# missing age fail the comparison and are dropped as well.
# NOTE(review): the summary message mentions "lactate day 1", but no lactate
# filter is applied in this script — confirm whether that criterion is
# enforced upstream or the wording is stale.
df3 = df2.loc[df2['admission_age'].ge(18)]
print(f"Removed {df2.shape[0] - df3.shape[0]} stays with non-adult patient")
demo3 = print_demo(get_demography(df3))
print(f"{df3.shape[0]} stays with sepsis, lactate day 1, LoS > 24h, adult patient \n({demo3})\n")
# Keep one stay per patient: after sorting by (subject_id, stay_id) the first
# row of every subject_id is retained.
# This replaces groupby('subject_id').apply(lambda g: g.iloc[0, 1:]), which
# ran a Python callback per patient and relied on the grouping column being
# part of each group — behaviour that recent pandas releases deprecate, and
# which would make the positional "1:" slice drop a different column.
# NOTE(review): this selects the lowest stay_id per patient, which is the
# earliest stay only if stay_id increases chronologically — TODO confirm.
first_stays = (
    df3.dropna(subset=["subject_id"])  # groupby ignored rows with a missing key
    .sort_values(by=["subject_id", "stay_id"], ascending=True)
    .drop_duplicates(subset="subject_id", keep="first")
)
# Same result layout as before: the first column is dropped and the frame is
# indexed by subject_id.  The copy keeps later column assignments on df4 from
# writing into a slice of first_stays.
# NOTE(review): the dropped first column is presumably a leftover CSV index
# column — TODO confirm.
df4 = first_stays.iloc[:, 1:].copy()
df4.index = pd.Index(first_stays["subject_id"], name="subject_id")
print(f"Removed {len(df3) - len(df4)} recurrent stays")
demo4 = print_demo(get_demography(df4))
print(f"{len(df4)} adults with sepsis, lactate day 1, LoS > 24h, adult patient, 1 stay per patient \n({demo4})\n")
# Columns in which a missing value is replaced by 0.
cols_na = [
    'major_surgery', 'hypertension_present', 'heart_failure_present',
    'copd_present', 'asthma_present', 'cad_present', 'ckd_stages',
    'connective_disease', 'pneumonia', 'uti', 'biliary', 'skin', 'respiration',
    'coagulation', 'cardiovascular', 'cns', 'liver',
]
# Fill all listed columns in one assignment instead of looping over them.
df4[cols_na] = df4[cols_na].fillna(0)
# Per measurement: [lowest accepted value, replacement value, highest accepted value].
lab_ranges = {
    'po2_min': [0, 90, 1000],
    'pco2_max': [0, 40, 200],
    'ph_min': [5, 7.35, 10],
    'lactate_max': [0, 1.05, 30],
    'glucose_max': [0, 95, 2000],
    'sodium_min': [0, 140, 160],
    'potassium_max': [0, 3.5, 9.9],
    'cortisol_min': [0, 20, 70],
    'fibrinogen_min': [0, 200, 1000],
    'inr_max': [0, 1.1, 10],
    'resp_rate_mean': [0, 15, 50],
    'heart_rate_mean': [0, 90, 250],
    'mbp_mean': [0, 85, 200],
    'temperature_mean': [32, 36.5, 45],
    'spo2_mean': [0, 95, 100],
}
for lab, (lower, default, upper) in lab_ranges.items():
    values = df4[lab]
    # A value is replaced by the default when it is missing, lies outside
    # [lower, upper], or is exactly 0 (0 is treated as "no valid measurement").
    needs_default = (
        values.isna() | (values < lower) | (values > upper) | (values == 0)
    )
    df4[lab] = np.where(needs_default, default, values)
# Impute hemoglobin_min with a sex-specific default, using vectorised masks
# instead of element-wise and row-wise DataFrame.apply calls (same values,
# no Python-level pass over every row, and safe on an empty frame).
hgb = df4['hemoglobin_min']
# Missing readings and readings below 3 or above 30 are treated as invalid.
hgb_invalid = hgb.isna() | (hgb < 3) | (hgb > 30)
# Replacement for invalid readings: 12 when sex_female == 1, 13.5 when
# sex_female == 0, and 0 when sex_female is neither (e.g. missing).
hgb_default = np.where(
    df4['sex_female'] == 1,
    12,
    np.where(df4['sex_female'] == 0, 13.5, 0),
)
df4['hemoglobin_min'] = np.where(hgb_invalid, hgb_default, hgb)
# Mean imputation: missing fluids_volume_norm_by_los_icu values receive the
# mean of the observed values in that column.
fluids = df4['fluids_volume_norm_by_los_icu']
df4['fluids_volume_norm_by_los_icu'] = fluids.fillna(fluids.mean())
print(f"df4 length after confounder imputation {len(df4)}")
# Export of the final cohort is currently disabled:
#df4.to_csv(os.path.join(root_dir, 'data/cohorts', 'SCCM_cohort.csv'))