-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathADSB_preprocessing.py
159 lines (112 loc) · 5.36 KB
/
ADSB_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pandas as pd
import numpy as np
import json
import pickle
import warnings
warnings.filterwarnings('ignore')
def preprocess_adsb(row, max_measurements, n_meas_diff):
"""Performs the ads-b data preprocessing that includes:
- exploding the measurements JSON array,
- extracting the sensor, timestamp and RSSI information from an array of measurements,
- conducting timestamps synchronization,
- adding the sensor localization data,
- performing the feature extraction,
- filling missing values.
Parameters
----------
row: pandas.Series
Row of raw ads-b data
max_measurements: int
Maximum number of measurements to take into consideration
n_meas_diff: int
Number of measurements to be taken into account while calculating differences
Returns
-------
df: pandas.DataFrame
Dataframe of preprocessed ads-b data
"""
# Convert row to pandas dataframe
df = row.to_frame().transpose()
# Explode measurements data
meas = df.measurements.apply(lambda row: json.loads(row)[:max_measurements])
meas = meas.apply(pd.Series)
df.drop(columns='measurements', inplace=True)
df = pd.concat([df, meas], axis=1)
meas_list = []
for col in range(max_measurements):
try:
columns = ['sensor_{}'.format(col), 'tmp_{}'.format(col), 'RSSI_{}'.format(col)]
meas_list.append(pd.DataFrame(df[col].to_list(), columns=columns))
df.drop(columns=col, inplace=True)
except KeyError:
empty_cols = {'sensor_{}'.format(col) : np.NaN, 'tmp_{}'.format(col) : 0,
'RSSI_{}'.format(col) : 0}
meas_list.append(pd.DataFrame(empty_cols, index=[0]))
# Concatenate all columns
meas = pd.concat(meas_list, axis=1)
meas.reset_index(inplace=True)
df.reset_index(inplace=True)
df = pd.concat([df, meas], axis=1)
# df.astype({'tmp_{}'.format(i): int for i in range(max_measurements)})
# Replace tmp_0 with its absolute value
df['tmp_0'] = df['tmp_0'].abs().astype('int')
# Read sensors data
sensors = pd.read_csv('round2_training/round2/round2_sensors.csv')
# Read the synchronization coefficients
with open(r"coeff_dict.pickle", "rb") as output_file:
coeff_dict = pickle.load(output_file)
sensors.set_index('serial', inplace=True)
# Add the sensor localization data
for i in range(max_measurements):
df = df.join(sensors.loc[:, ['latitude', 'longitude', 'height']], on='sensor_{}'.format(i),
rsuffix='_{}'.format(i))
df.rename({'height': 'height_{}'.format(i)}, axis=1, inplace=True)
# Perform timestamp synchronization
corrected_tmp = {}
s1 = df.sensor_0
timeAtServer = df.timeAtServer
for i in range(1, max_measurements):
corrected_tmp.setdefault(i, [])
s2 = getattr(df, 'sensor_{}'.format(i))
tmp_2 = getattr(df, 'tmp_{}'.format(i))
if tmp_2[0] == 0:
corrected_tmp[i].append(0)
continue
# Return the correction coefficients (default=(0,0))
m, b = coeff_dict.get('{}_{}'.format(int(s1), int(s2)), (0, 0))
corr = m * timeAtServer + b
corrected_tmp[i].append(int(tmp_2 - corr))
for i in range(1, max_measurements):
df['tmp_{}'.format(i)] = pd.Series(corrected_tmp[i])
df.drop(columns=['id', 'numMeasurements'] + ['sensor_{}'.format(i) for i in range(max_measurements)], inplace=True)
# Feature extraction
# Calculate the timestamp differences
for col_1 in range(1, n_meas_diff):
for col_2 in range(col_1):
df['diff_{}_{}'.format(col_1, col_2)] = int(df['tmp_{}'.format(col_1)]) - int(df['tmp_{}'.format(col_2)])
# Calculate the mean latitude
df['mean_lat'] = df[[col for col in df.columns if str(col).startswith('latitude_')]].mean(axis=1)
# Calculate the mean longitude
df['mean_lon'] = df[[col for col in df.columns if str(col).startswith('longitude_')]].mean(axis=1)
# Mask NaN values
mask = ~np.isnan(df[[col for col in df.columns if str(col).startswith('latitude_')]])
# Calculate the weighted average of latitude
df['w_mean_lat'] = float(np.average(df[[col for col in df.columns if str(col).startswith('latitude_')]].to_numpy()[mask],
axis=0,
weights=1/abs(df[[col for col in df.columns if str(col).startswith('tmp_')]].to_numpy()[mask])))
# Calculate the weighted average of longitude
df['w_mean_lon'] = float(np.average(df[[col for col in df.columns if str(col).startswith('longitude_')]].to_numpy()[mask],
axis=0,
weights=1/abs(df[[col for col in df.columns if str(col).startswith('tmp_')]].to_numpy()[mask])))
# Fill missing values
values = {}
for col in df.columns:
if ('latitude' in col) or ('mean_lat' in col):
values[col] = -90
elif ('longitude' in col) or ('mean_lon' in col):
values[col] = -180
else:
values[col] = 0
df.drop(columns=['index'], inplace=True)
df.fillna(value=values, inplace=True)
return df