-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCorrelation-Analysis.py
84 lines (65 loc) · 3.11 KB
/
Correlation-Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 15 11:58:04 2020
@author: jstrait
This project will take NLP data including indicators for the presence of certain
concepts, segment the data by relevant variables, and produce correlation data
between the presence of a concept and whether or not the claim was referred.
This is a condensed and data-free version of a correlation analysis script that
I used for a project at MetLife.
"""
## Front Matter
import pandas as pd
from pandas import *
from scipy import stats
from scipy.stats.stats import pearsonr
## Input data
data = pd.read_excel("insert file name here.xlsx")
## Data processing
# View only ended STD claims
data = data[data['CLM_STATUS'] == 'ENDED'].copy()
data = data[data['PROD_TYPE'] == 'STD'].copy()
# Subset data by duration
short = data[data['SHORT_LONG'] == 'SHORT']
long = data[data['SHORT_LONG'] == 'LONG']
## Identify entities and prepare data for correlation analysis
correlations = {}
col_list = data.columns.tolist()
remove_cols = {'CLM_NUM_CD', 'SYS_DATA_TYP_CD', 'action_text_mapped_concepts', 'PROD_TYPE', 'CLM_STATUS', 'ACNT_DURATION', 'SHORT_LONG'}
var_list = [ele for ele in col_list if ele not in remove_cols]
## Calculate correlations for unsegmented data
# Calculate correlations based on referral indicator
col_a = "REF_IND"
for entity in var_list:
correlations[col_a + '__' + entity] = stats.pearsonr(data[col_a], data[entity])
data_referral = pd.DataFrame.from_dict(correlations, orient='index')
data_referral.columns = ['ref_pearson_r', 'ref_p-value']
## Calculate correlations for duration segments
correlations = {}
# The metrics data frame tracks how many occurrences of a concept there are in the dataset
short_metrics = {}
for entity in var_list:
correlations[col_a + '__' + entity] = stats.pearsonr(short[col_a], short[entity])
short_ref = pd.DataFrame.from_dict(correlations, orient='index')
short_metrics['{}'.format(entity)] = sum(short[entity])
short_ref.columns = ['short_ref_pearson_r', 'short_ref_p-value']
short_metrics = pd.DataFrame.from_dict(short_metrics, orient = 'index')
short_metrics.columns = ['short_count']
correlations = {}
long_metrics = {}
for entity in var_list:
correlations[col_a + '__' + entity] = stats.pearsonr(long[col_a], long[entity])
long_ref = pd.DataFrame.from_dict(correlations, orient='index')
long_metrics['{}'.format(entity)] = sum(long[entity])
long_ref.columns = ['long_ref_pearson_r', 'long_ref_p-value']
long_metrics = pd.DataFrame.from_dict(long_metrics, orient = 'index')
long_metrics.columns = ['long_count']
metrics_table = pd.merge(short_metrics, long_metrics, left_index = True, right_index = True)
## Create table
data_dur_ref_table = pd.merge(data_referral, short_ref, left_index=True, right_index=True)
data_dur_ref_table = pd.merge(data_dur_ref_table, long_ref, left_index=True, right_index=True)
## Export data to an Excel file
writer = pd.ExcelWriter("desired file location.xlsx")
data_dur_ref_table.to_excel(writer, "Duration Segments")
metrics_table.to_excel(writer, "Metrics")
writer.save()