-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpost_analysis.py
433 lines (368 loc) · 17.3 KB
/
post_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
"""
"""
#%%
## Import the libraries
import rasterio
import geopandas as gpd
from shapely.geometry import box
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, date, timedelta
import seaborn as sns
from io import StringIO
import streamlit as st
import plotly.express as px
import contextily as ctx
from matplotlib.colors import LinearSegmentedColormap
import streamlit as st
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
import os
import glob
import ee
import geemap
from tqdm import tqdm # Import tqdm for the progress bar
from rasterio.plot import show
#import geemap.colormaps as cm
import folium
from matplotlib.dates import DateFormatter, DayLocator
from branca.colormap import LinearColormap
import altair as alt
import geemap.foliumap as geema
#%%
#__________________________________________________POST ANALYSIS______________________________________________________________________________________
# Title of the Streamlit App
st.title("Paddy Flooding Detection using Sentinel 2 Analysis (2019-2024)")
# NOTE(review): the four imports below repeat names that the import block at
# the top of this file already brings into scope.
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
# Constants
# Crop-calendar markers expressed as day of year (DOY). The plotting code draws
# them as dashed vertical reference lines on the cumulative-area charts.
# The values match the dates in the trailing comments for a non-leap year; in a
# leap year every marker after February falls one calendar day earlier.
START_PLANTING = 46 # 15 Feb
END_PLANTING = 74 # 15 Mar
START_HARVESTING = 186 # 5 Jul
END_HARVESTING = 259 # 16 Sep
def read_github_csv(url):
    """Load a CSV into a DataFrame from its GitHub page URL.

    The URL is rewritten before download: the ``github.com`` host becomes
    ``raw.githubusercontent.com`` and the ``/blob/`` path segment is collapsed
    to ``/``. The rewritten address is then handed to ``pd.read_csv``.

    Args:
        url: Address of the CSV file.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    with_raw_host = url.replace("github.com", "raw.githubusercontent.com")
    download_url = with_raw_host.replace("/blob/", "/")
    return pd.read_csv(download_url)
def process_rs_data(df):
    """Collapse per-date remote-sensing columns into one total per date.

    Every column whose name ends in a date (``YYYY-MM-DD`` or ``YYYYMMDD``)
    is summed over all rows; all other columns are ignored.

    Args:
        df: DataFrame holding one column per observation date.

    Returns:
        DataFrame with one row per date column and the columns:
        ``Time`` (parsed datetime), ``Area(ha)`` (column total), ``Year``
        (first four characters of the column name), ``Class``
        (``'RS_'`` followed by the year) and ``DOY`` (day of year).
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    rs_df = df.filter(regex=r'\d{4}-?\d{2}-?\d{2}$')
    # Blank or otherwise non-numeric cells become NaN, so the sum skips them
    # instead of raising on mixed text/number columns.
    rs_df = rs_df.apply(pd.to_numeric, errors='coerce')
    area_rs = rs_df.sum(axis=0)  # axis=0: one total per date column
    rs_df_combined = pd.DataFrame({
        'Time': area_rs.index,
        'Area(ha)': area_rs.values,
    })
    # The year is taken from the column name while it is still text.
    rs_df_combined['Year'] = rs_df_combined['Time'].str[:4]
    rs_df_combined['Class'] = 'RS_' + rs_df_combined['Year']
    rs_df_combined['Time'] = pd.to_datetime(rs_df_combined['Time'])
    rs_df_combined['DOY'] = rs_df_combined['Time'].dt.dayofyear
    return rs_df_combined
def plot_cumulative_area(df, title):
    """Draw one flooded-area line per year against day of year.

    Args:
        df: DataFrame with a datetime ``Time`` column plus ``DOY`` and
            ``Area(ha)`` columns.
        title: Text used as the axes title.

    Returns:
        The Matplotlib figure holding the plot.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # One line per calendar year, in order of first appearance in the data.
    observation_year = df['Time'].dt.year
    for yr in observation_year.unique():
        subset = df[observation_year == yr]
        ax.plot(subset['DOY'], subset['Area(ha)'], marker='o', linestyle='-', label=f'RS Area {yr}')

    # Crop-calendar reference lines: (day of year, colour, legend label).
    season_markers = (
        (START_PLANTING, 'blue', 'Start Planting (15 Feb)'),
        (END_PLANTING, 'green', 'End Planting (15 Mar)'),
        (START_HARVESTING, 'orange', 'Start Harvesting (5 Jul)'),
        (END_HARVESTING, 'red', 'End Harvesting (16 Sep)'),
    )
    for doy, colour, legend_text in season_markers:
        ax.axvline(doy, color=colour, linestyle='--', label=legend_text)

    ax.set_title(title)
    ax.set_xlabel('Day of Year (DOY)')
    ax.set_ylabel('Area (ha)')
    ax.grid(True)
    ax.legend()
    return fig
def process_data(urls, title):
    """Download the CSVs behind ``urls``, merge them, and render the results.

    Each URL is read with ``read_github_csv``; a ``flooding_date`` column is
    dropped when present. The frames are placed side by side, repeated column
    names are reduced to their first occurrence, and the result is aggregated
    with ``process_rs_data``. The plot goes into a wide left column and the
    first 20 aggregated rows into a narrow right column.

    Args:
        urls: Iterable of CSV addresses.
        title: Plot title; also used in the warning shown when nothing loads.

    Returns:
        None. All output is written to the Streamlit page.
    """
    dataframes = []
    for url in urls:
        try:
            df = read_github_csv(url).drop(columns=['flooding_date'], errors='ignore')
        except Exception as e:
            # Best effort: report the failing URL in the app and keep loading
            # the remaining files.
            st.error(f"Error reading {url}: {e}")
        else:
            dataframes.append(df)

    if not dataframes:
        st.warning(f"No {title} data available.")
        return

    combined_df = pd.concat(dataframes, axis=1)
    # Keep only the first occurrence of any repeated column name.
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]
    processed_df = process_rs_data(combined_df)
    fig = plot_cumulative_area(processed_df, title)

    col1, col2 = st.columns([3, 1])
    col1.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed;
    # release it once rendered so reruns of the script do not pile figures up.
    plt.close(fig)
    col2.subheader("Data Sample")
    col2.write(processed_df.head(20))
# Section 1: cumulative flooded area from the Dagana plot files, one CSV per
# year from 2019 through 2024.
st.header("**1. Dagana Plots**")
dagana_urls = list(map(
    'https://github.com/ICRISAT-Senegal/Remote-sensing/blob/main/flooding_data_{}.csv'.format,
    range(2019, 2025),
))
process_data(dagana_urls, "2019-2024 Cumulative Flooded Areas using Dagana Plots")

# Section 2: the same view built from the agCelerant plot files.
st.header("**2. agCelerant Plots**")
agcelerant_urls = list(map(
    'https://github.com/ICRISAT-Senegal/Remote-sensing/blob/main/combined_flooding_data_{}.csv'.format,
    range(2019, 2025),
))
process_data(agcelerant_urls, "2019-2024 Cumulative Flooded Areas using agCelerant Plots")
#___________________________________________________________credit information________________________-
# Header for Credit Information
# Everything in this section is static narrative text written to the page;
# nothing below is computed from the loaded data.
st.header("3. Credit Information")
# Initial Hypotheses
st.subheader("Initial Hypotheses")
st.markdown("""
- **agCelerant Data**:
- Increase in flooding → Early promise of credit.
- Decrease in flooding → Delay in the promise of credit.
""")
# Learnings from Data Exploration
st.subheader("Learnings from Exploration of the Data")
# Hypothesis about the first spike in the flooding curve
st.subheader("First Spike Hypothesis")
st.markdown("""
- Some GIEs might have internal financial reserves that are used.
- Some GIEs have good creditworthiness and could use this to get a loan from the bank.
""")
# Hypothesis about the second spike in the flooding curve
st.subheader("Second Spike Hypothesis")
st.markdown("""
- Reaction by the second category of farmers (those who lack internal reserves and thin file GIEs) to the promise of credit.
- Or the same group has learned about the decisions of the banks on credit issuance.
""")
# Hypothesis about flooding that starts early
st.subheader("Early Flooding Hypothesis")
st.markdown("""
- Early flooding explains/reflects the credit disbursement process.
- Everyone starts the cropping cycle without any payment. The cycle is based on promises or commitments by the GIE to get credit.
- Some unions have reserves to start planting before funds are released.
- The early start does not necessarily mean the money has been released yet.
""")
# New Hypothesis
st.subheader("New Hypothesis")
st.markdown("""
- **Creditworthiness and Reserves of GIE Explain the Monotonic Pattern of Flooding**
- Does the absence of a plateau imply reserves?
- Is there a significant difference between the lengths of the plateaus? Does a significant difference in the length of the plateau reflect different access to credit?
- Extract the date of the credit committee for the Dry hot seasons from 2017-2024 years.
- Explore the topology of GIE.
""")
# Action Points
st.subheader("Action Points")
st.markdown("""
- Consider **quantitative data analysis**: This approach is more process-based and accumulates evidence from many points to understand how the accumulation of events (like floods) reflects credit disbursement timelines.
- Zoom in to analyze **GIEs**:
- See their performance in terms of their decision to go into production.
- The commitment of credit & the farmers' response.
- Extract boundaries and calculate the total areas for confirmation with "Soule".
- Investigate the date of credit committee meetings and decisions for each year.
- Investigate when money starts flowing for them (GIE).
""")
# NOTE(review): the row, column and unique-value counts in this summary are
# literal text, not derived from the CSV loaded later in the script, so they
# need a manual update whenever that file changes.
st.markdown("General Information: The dataset contains 4,608 rows and 258 columns. "
"The `parcel_cod` column has 596 unique values, indicating multiple parcels. "
"The `operating_account` column contains 163 unique types of operations, "
"with 'Assurance CNAAS' being the most frequent. "
"There are missing data in `credit_auth_date` and `credit_exec_date`.")
#_____________________________________________________________uuuuuuuuuuuuuu____________________________
# Merged credit / remote-sensing table, read from the GitHub repository.
file_url = 'https://github.com/ICRISAT-Senegal/Remote-sensing/blob/main/merged_cr_rs_data.csv'
combined_df_credit = read_github_csv(file_url)
st.markdown("**Sample of the Credit Data:**")
st.dataframe(combined_df_credit.head(20))
#_____________________________________GIE analysis________________________________________-_______________________________
st.header("4. GIE Analysis")
# Work on a copy so the table as loaded stays untouched.
data = combined_df_credit.copy()
# Missing values become empty strings so a single comparison against ''
# identifies rows that lack a date.
data = data.fillna('')
# Keep only rows where the request, authorization and execution dates are all present.
CREDIT_DATE_COLUMNS = ('credit_req_date', 'credit_auth_date', 'credit_exec_date')
has_credit_details = data[
    (data['credit_req_date'] != '') &
    (data['credit_auth_date'] != '') &
    (data['credit_exec_date'] != '')
]
#%%
# Number of distinct GIEs in the whole table.
all_gies = data['gie_name'].unique()
st.write(f"- Total number of GIEs available for analysis: {len(all_gies)}")
# Number of distinct GIEs that have at least one row with all three dates.
gies_with_credit_details = has_credit_details['gie_name'].unique()
st.write(f"- Number of GIEs with complete credit details: {len(gies_with_credit_details)}")
st.write('-Time to execution: the number of days from credit request to execution')
# GIE picker, rendered in the main page body.
selected_gie = st.selectbox('Select GIE:', gies_with_credit_details)
st.markdown(f"**Details for GIE: {selected_gie}**")
# .copy(): the column assignments below must land on an independent frame,
# not on a slice of has_credit_details.
filtered_data = has_credit_details[has_credit_details['gie_name'] == selected_gie].copy()
# Parse the three credit dates of the selected rows; unparseable values become NaT.
for credit_date_column in CREDIT_DATE_COLUMNS:
    filtered_data[credit_date_column] = pd.to_datetime(filtered_data[credit_date_column], errors='coerce')
# Whole days between the credit request and its execution.
filtered_data['time_to_execution'] = (filtered_data['credit_exec_date'] - filtered_data['credit_req_date']).dt.days
# Summary statistics of the request-to-execution delay. They are computed here
# but not written to the page.
time_to_execution_stats = filtered_data['time_to_execution'].describe()
# Histogram of the delay; rows without a computable delay are left out.
fig2, ax2 = plt.subplots(figsize=(10, 6))
ax2.hist(filtered_data['time_to_execution'].dropna(), bins=20, color='skyblue', edgecolor='black')
ax2.set_title(f'Distribution of Time to Execution for Credit Requests for GIE: {selected_gie}')
ax2.set_xlabel('Days from Credit Request to Execution')
ax2.set_ylabel('Frequency')
ax2.grid(True)
# The figure is handed to Streamlit directly, so no plt.show() call is needed.
st.pyplot(fig2)
# Release the figure once rendered so reruns of the script do not pile figures up.
plt.close(fig2)
# Per-date flooded-area totals for the selected GIE. The rows are taken from
# the table as loaded (same index labels as filtered_data) because
# filtered_data has had its missing values replaced by empty strings, which
# would put text into the numeric date columns being summed.
gie_rs_df = process_rs_data(combined_df_credit.loc[filtered_data.index])
# One line per calendar year, in order of first appearance in the data.
years = gie_rs_df['Time'].dt.year.unique()
fig, ax = plt.subplots(figsize=(10, 6))
for year in years:
    year_df = gie_rs_df[gie_rs_df['Time'].dt.year == year]
    ax.plot(year_df['DOY'], year_df['Area(ha)'], marker='o', linestyle='-', label=f'RS Area {year}')
# Crop-calendar reference lines, taken from the module-level day-of-year
# constants rather than re-derived from hard-coded 2023 dates.
ax.axvline(START_PLANTING, color='blue', linestyle='--', label='Start Planting (15 Feb)')
ax.axvline(END_PLANTING, color='green', linestyle='--', label='End Planting (15 Mar)')
ax.axvline(START_HARVESTING, color='orange', linestyle='--', label='Start Harvesting (5 Jul)')
ax.axvline(END_HARVESTING, color='red', linestyle='--', label='End Harvesting (16 Sep)')
ax.set_title(f'2019-2024 Cumulative Flooded Areas for GIE: {selected_gie}')
ax.set_xlabel('Day of Year (DOY)')
ax.set_ylabel('Area (ha)')
ax.grid(True)
ax.legend()
st.pyplot(fig)
# Release the figure once rendered so reruns of the script do not pile figures up.
plt.close(fig)
# Bar chart: number of the selected GIE's rows per irrigation type.
st.subheader(f"Count of Irrigation Types for: {selected_gie}")
irrigation_type_counts = filtered_data['irrigation_type'].value_counts()
fig_irrigation, ax_irrigation = plt.subplots(figsize=(8, 4))
ax_irrigation.bar(
    x=irrigation_type_counts.index,
    height=irrigation_type_counts.values,
    color='skyblue',
)
ax_irrigation.set(
    title='Irrigation Type Count',
    xlabel='Irrigation Type',
    ylabel='Count',
)
ax_irrigation.grid(True)
st.pyplot(fig_irrigation)
# Gantt-style view of the credit stages of every row of the selected GIE,
# grouped on the y-axis by operating account.
st.title(f'Operations accounts for {selected_gie}')
st.write(' Visualizes the credit request, authorization, and execution process for different operating accounts ( eg gasoil irrigation) in a timeline format. This allows you to easily track the progress of each operating account’s credit process over time.')

# (stage label, column giving the bar start, column giving the bar end).
# The "Credit Executed" stage starts and ends on the execution date.
stage_spans = (
    ('Credit Requested', 'credit_req_date', 'credit_auth_date'),
    ('Credit Authorized', 'credit_auth_date', 'credit_exec_date'),
    ('Credit Executed', 'credit_exec_date', 'credit_exec_date'),
)
# Three records per data row, in stage order.
timeline_data = [
    {
        'Task': record['operating_account'],
        'Start': record[start_column],
        'Finish': record[end_column],
        'Stage': stage_label,
    }
    for _, record in filtered_data.iterrows()
    for stage_label, start_column, end_column in stage_spans
]
timeline_df = pd.DataFrame(timeline_data)

# Fixed colour per stage so the legend reads the same for every GIE.
custom_colors = {
    'Credit Requested': 'dodgerblue',
    'Credit Authorized': 'orange',
    'Credit Executed': 'green',
}
hover_date_format = '|%B %d, %Y'
fig_timeline = px.timeline(
    timeline_df,
    x_start="Start",
    x_end="Finish",
    y="Task",
    color="Stage",
    color_discrete_map=custom_colors,
    title=f"Credit Request, Authorization, and Execution Timeline for GIE: {selected_gie}",
    hover_data={
        'Start': hover_date_format,
        'Finish': hover_date_format,
        'Stage': True,
        'Task': True,
    },
    labels={"Task": "Operation Account", "Stage": "Credit Stage"},
)

# Axis titles, size, margins and font.
fig_timeline.update_layout(
    xaxis_title="Date",
    yaxis_title="Operation Account",
    showlegend=True,
    height=600,
    margin={'l': 100, 'r': 50, 't': 50, 'b': 50},
    font={'family': "Arial, sans-serif", 'size': 12},
)
# Rotated, explicitly formatted date ticks on the x-axis.
fig_timeline.update_xaxes(tickformat="%b %d, %Y", tickangle=45, nticks=20)

st.plotly_chart(fig_timeline)
# Full table of the rows behind the charts for the selected GIE.
st.dataframe(filtered_data)
#________________________________________________________PPPP___________________________________
# Prediction
# Static introduction to the logistic growth model.
# Numbered 5: "4." is already used by the "GIE Analysis" header earlier in
# the page.
st.header('5. Estimation of the Flooded area using growth Models')
# Title and description
st.title("Growth Curve: Logistic")
# Equation
st.markdown("### Equation:")
st.latex(r"P(t) = \frac{K}{1 + \left( \frac{K - P_0}{P_0} \right) e^{-rt}}")
# Explanation of the terms
st.markdown("""
Where:
- **P(t)**: Population or size at time **t**,
- **K**: Carrying capacity (the maximum size the population can reach),
- **P₀**: Initial population or size,
- **r**: Growth rate,
- **t**: Time.
""")
# Shape explanation
st.markdown("""
### Shape:
The logistic curve is S-shaped (sigmoidal). It starts with an exponential growth phase,
slows down as the population approaches the carrying capacity, and eventually levels off at the carrying capacity.
""")
# Growth dynamics
st.markdown("""
### Growth Dynamics:
- **Initial Phase**: Rapid growth (exponential).
- **Middle Phase**: Growth rate decreases as resources become limited.
- **Final Phase**: Growth slows down and asymptotically approaches the carrying capacity, **K**.
""")
#plot the prediction and the data
def process_rs_data(dff):
    """Collapse per-date remote-sensing columns into one total per date.

    Every column whose name ends in a date (``YYYY-MM-DD`` or ``YYYYMMDD``)
    is summed over all rows; all other columns are ignored.

    Args:
        dff: DataFrame holding one column per observation date.

    Returns:
        DataFrame with one row per date column and the columns:
        ``Time`` (parsed datetime), ``Area(ha)`` (column total), ``Year``
        (first four characters of the column name), ``Class``
        (``'RS_'`` followed by the year) and ``DOY`` (day of year).
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    rs_df = dff.filter(regex=r'\d{4}-?\d{2}-?\d{2}$')  # date-named columns only
    # Blank or otherwise non-numeric cells become NaN, so the sum skips them
    # instead of raising on mixed text/number columns.
    rs_df = rs_df.apply(pd.to_numeric, errors='coerce')
    area_rs = rs_df.sum(axis=0)  # axis=0: one total per date column
    rs_df_combined = pd.DataFrame({
        'Time': list(area_rs.index),
        'Area(ha)': list(area_rs.values),
    })
    # The year is taken from the column name while it is still text.
    rs_df_combined['Year'] = rs_df_combined['Time'].str[:4]
    rs_df_combined['Class'] = 'RS_' + rs_df_combined['Year']
    rs_df_combined['Time'] = pd.to_datetime(rs_df_combined['Time'])
    rs_df_combined['DOY'] = rs_df_combined['Time'].dt.dayofyear
    return rs_df_combined
# Observed flooded areas (Dagana plots, 2024 file) next to the growth-model
# prediction series.
dag_rs_df = process_rs_data(read_github_csv('https://github.com/ICRISAT-Senegal/Remote-sensing/blob/main/flooding_data_2024.csv'))
file_log = 'https://github.com/ICRISAT-Senegal/Remote-sensing/blob/main/prediction_growth_ts_first_attempt.csv'
data_log = read_github_csv(file_log)
# Bring the prediction table onto the column names used by the observed series.
data_log['Time'] = pd.to_datetime(data_log['time_t'])
data_log['Area(ha)'] = data_log['area_t']
if 'Class' not in data_log.columns:
    # Without a class label the prediction rows carry NaN in 'Class' after the
    # concat below and can never match the per-class filter, so the curve
    # would be missing from the plot.
    # NOTE(review): confirm whether the prediction CSV ships its own 'Class' column.
    data_log['Class'] = 'log_2024'
data_log['DOY'] = data_log['Time'].dt.dayofyear
fig, ax = plt.subplots(figsize=(10, 6))
combined_df_g = pd.concat([dag_rs_df, data_log], axis=0)
# One dashed line per class label; rows without a label are skipped.
for cls in combined_df_g['Class'].dropna().unique():
    class_data = combined_df_g[combined_df_g['Class'] == cls]
    ax.plot(class_data['DOY'], class_data['Area(ha)'], marker='x', linestyle='--', label=cls)
# The title states the year of the file loaded above instead of reading a loop
# variable left over from an unrelated section of the script.
ax.set_title('Cumulative Flooded Areas 2024', fontsize=10)
ax.set_xlabel('Day of Year (DOY)')
ax.set_ylabel('Cumulative Flooded Area (ha)')
ax.legend(title='Classes')
ax.grid(True)
# The figure is handed to Streamlit directly, so no plt.show() call is needed.
st.pyplot(fig)
# Release the figure once rendered so reruns of the script do not pile figures up.
plt.close(fig)