#
# Title: Data Preparation for LINK Configs and Transmissions
# Author: Fabian Kovac <ds191008@fhstp.ac.at>
# Team: University of Applied Sciences St. Pölten
# Version: 2.6
# Last changed: 2021-09-22
#
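#
# Usage (hypothetical file names - the INCA directory stem is expected to be the date in YYYYMMDD format):
#   python prep.py -c LINK_Config_20210501.csv -t LINK_Transmissions_20210501.csv -i ./INCA/20210501
#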

import sys
import gzip
import pathlib
import argparse
import datetime

import numpy as np
import pandas as pd


def parse_arguments() -> argparse.Namespace:
    """Parses provided command line arguments for LINK
    
    Returns:
        args (argparse.Namespace): Object with paths to provided files
    """
    
    # create argument parser with description
    desc = '# Data Preparation for LINK\n'
    desc += '-'*64 + '\n'
    desc += 'Script outputs the same files with a "_clean" suffix.\n'
    desc += 'Existing clean versions are automatically overwritten!'
    
    parser = argparse.ArgumentParser(
        prog = 'prep.py',
        usage = 'python %(prog)s -c <config_file> -t <transmissions_file> -i <inca_dir>',
        description = desc,
        formatter_class = argparse.RawTextHelpFormatter
    )
    
    # add required argument group
    required_args = parser.add_argument_group('required arguments')
    
    # add config parameter to parser
    required_args.add_argument('-c', '--config', type = str, required = True, help = 'Path to Config-File')
    
    # add transmissions parameter to parser
    required_args.add_argument('-t', '--transmissions', type = str, required = True, help = 'Path to Transmissions-File')
    
    # add inca parameter to parser
    required_args.add_argument('-i', '--inca', type = str, required = True, help = 'Path to Inca-Dir')
    
    # parse arguments
    args = parser.parse_args()
    
    return args


def _log(msg: str) -> None:
    """Logs message
    
    Parameters:
        msg (str): Message to log to console
    """
    
    # add marker to log message
    marker = '%'
    if msg[:1] == '\n':
        msg = f'\n{marker} {msg[1:]}'
    else:
        msg = f'{marker} {msg}'
    
    # print message
    print(msg)


def load_inca_file(file_inca: pathlib.Path) -> np.ndarray:
    """Loads gzipped INCA data of a given file
    
    Parameters:
        file_inca (pathlib.Path): Path to INCA file
    
    Returns:
        x (np.ndarray): Matrix with shape (401, 701)
    """
    
    # open zipped file and bring data to right shape (resolution x,y: 701x401 km2)
    with gzip.open(file_inca, 'rb') as file:
        x = file.read()
        x = np.fromstring(x, dtype = np.float32, sep = ' ')
        x = np.reshape(x, (401, 701))

        return x


def load_inca_data(dir_inca: pathlib.Path) -> np.ndarray:
    """Loads INCA files of a given INCA directory
    
    Parameters:
        dir_inca (pathlib.Path): Directory with INCA files
    
    Returns:
        inca_data (np.ndarray): Tensor with shape (96, 401, 701)
    """
    
    # initialize tensor with 96 15min intervals
    inca_data = np.zeros((96, 401, 701))
    
    # load inca files from inca dir
    for i, file_inca in enumerate(sorted([file for file in dir_inca.iterdir() if file.is_file()])):        
        # load zipped ascii data
        data = load_inca_file(file_inca)
        
        # update inca tensor
        inca_data[i] = data
    
    return inca_data


def get_midpoint(lon_a: np.ndarray, lat_a: np.ndarray, lon_b: np.ndarray, lat_b: np.ndarray) -> tuple:
    """Calculates the midpoint between two WGS 84 coordinate pairs
    
    Parameters:
        lon_a (np.ndarray): Longitudes of point A
        lat_a (np.ndarray): Latitudes of point A
        lon_b (np.ndarray): Longitudes of point B
        lat_b (np.ndarray): Latitudes of point B
    
    Returns:
        lon, lat (np.ndarray): Vectors with longitudes and latitudes of the midpoints
    """
    
    # convert degrees to radians
    lon_a, lat_a = np.radians(lon_a), np.radians(lat_a)
    lon_b, lat_b = np.radians(lon_b), np.radians(lat_b)
    
    # calculate midpoint
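    # --> standard spherical half-way point formula (treats the WGS 84 coordinates as spherical)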
    Bx = np.cos(lat_b) * np.cos(lon_b - lon_a)
    By = np.cos(lat_b) * np.sin(lon_b - lon_a)
    
    lon_mid = lon_a + np.arctan2(By, np.cos(lat_a) + Bx)
    lat_mid = np.arctan2(
        np.sin(lat_a) + np.sin(lat_b),
        np.sqrt(np.square(np.cos(lat_a) + Bx) + np.square(By))
    )
    
    return np.degrees(lon_mid), np.degrees(lat_mid)


def get_distance(lon_a: np.ndarray, lat_a: np.ndarray, lon_b: np.ndarray, lat_b: np.ndarray) -> np.ndarray:
    """Calculates the distance between two coordinates in km using a WGS84 rotation-ellipsoid
    
    Parameters:
        lon_a (np.ndarray): Longitudes of point A
        lat_a (np.ndarray): Latitudes of point A
        lon_b (np.ndarray): Longitudes of point B
        lat_b (np.ndarray): Latitudes of point B
    
    Returns:
        distance (np.ndarray): Vector with distances in km
    """
    
    # constants (equator radius and pole radius in km)
    # r_equator is the 'semi-major axis' and r_pole the 'semi-minor axis' on a WGS84 ellipsoid
    r_equator = 6378.137
    r_pole = 6356.7523142
    
    # calculate points on rotation-ellipsoid
    # point A
    za = np.sin(np.radians(lat_a)) * r_pole
    ra = np.cos(np.radians(lat_a)) * r_equator
    xa = np.sin(np.radians(lon_a)) * ra
    ya = np.cos(np.radians(lon_a)) * ra
    
    # point B
    zb = np.sin(np.radians(lat_b)) * r_pole
    rb = np.cos(np.radians(lat_b)) * r_equator
    xb = np.sin(np.radians(lon_b)) * rb
    yb = np.cos(np.radians(lon_b)) * rb
    
    # calculate differences between A and B
    dx = xa - xb
    dy = ya - yb
    dz = za - zb
    
    # calculate distances between points using the square root of the sum of squares
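    # --> this is the straight-line (chord) distance between the two points, which closely approximates the arc length for short link paths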
    distance = np.sqrt(np.square(dx) + np.square(dy) + np.square(dz))
    	
    return distance


def wgs84_to_lambert(lon: np.ndarray, lat: np.ndarray) -> tuple:
    """Convert WGS 84 coordinates to the Austrian Lambert Conformal Conic Projection (lcc)
    EPSG code: 31287, see https://epsg.io/31287
    
    Parameters:
        lon (np.ndarray): Vector containing WGS 84 longitudes
        lat (np.ndarray): Vector containing WGS 84 latitudes
    
    Returns:
        x, y (np.ndarray): Vectors containing lcc coordinates
    """
    
    # convert wgs 84 coordinates to radians
    lon = np.radians(lon)
    lat = np.radians(lat)

    # define standard parallels according to EPSG:31287 - MGI/Austria Lambert
    lat1 = np.radians(49)
    lat2 = np.radians(46)
    
    # point of reference:
    #   lon: 13°20'E
    #   lat: 47°30'N
    lon0 = np.radians(13.33333333333333)
    lat0 = np.radians(47.5)
    
    # false easting/northing
    # INCA grid: 701x401 km
    # compensate for point of reference (middle of the grid)
    # move point of reference (0, 0) to following x and y coordinates (x0, y0) to e.g. get strictly positive coordinates
    # false easting: 350500m
    # false northing: 200500m
    x0 = 350500
    y0 = 200500
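    # --> with this offset the point of reference maps exactly to (x0, y0), the middle of the 701x401 km INCA grid in m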
    
    # volumetric mean radius of the earth in m
    R = 6371000

    # lambert conformal conic projection as in https://pubs.usgs.gov/pp/1395/report.pdf
    # reference:
    # >>    J. P. Snyder, Map projections--a working manual. Washington: U.S. G.P.O.,
    # >>    For sale by the Supt. of Docs., 1987., pp. 104-110
    n = np.log(np.cos(lat1) * (1 / np.cos(lat2))) / np.log(np.tan(np.pi/4 + lat2/2) * (1 / np.tan(np.pi/4 + lat1/2)))
    F = (np.cos(lat1) * np.tan(np.pi/4 + lat1/2)**n) / n
    p = R * F * (1 / np.tan(np.pi/4 + lat/2))**n
    p0 = R * F * (1 / np.tan(np.pi/4 + lat0/2))**n

    # calculate x and y in lcc
    x = p * np.sin(n * (lon - lon0)) + x0
    y = p0 - p * np.cos(n * (lon - lon0)) + y0

    return x, y


def lambert_to_inca_idx(x: np.ndarray, y: np.ndarray) -> tuple:
    """Convert x and y of the Lambert Conformal Conic Projection to INCA indices
    
    Parameters:
        x (np.ndarray): Vector containing x values in meter of the Lambert Conformal Conic Projection
        y (np.ndarray): Vector containing y values in meter of the Lambert Conformal Conic Projection
    
    Returns:
        ix, iy (np.ndarray): Indices for INCA data
    """
    
    # convert from m to km
    x /= 1000
    y /= 1000
    
    # convert coordinates to INCA index (nearest coordinates as int)
    ix = np.round(x, decimals = 0).astype(int)
    iy = np.round(y, decimals = 0).astype(int)
    
    return ix, iy


def get_inca_data(datetimes: np.ndarray, lon_a: np.ndarray, lat_a: np.ndarray, lon_b: np.ndarray, lat_b: np.ndarray, lengths: np.ndarray) -> list:
    """Get INCA RR data for each km of the link path
    
    Parameters:
        datetimes (np.ndarray): Vector containing datetimes of transmissions
        lon_a (np.ndarray): Vector containing longitude values of LINK site a
        lat_a (np.ndarray): Vector containing latitude values of LINK site a
        lon_b (np.ndarray): Vector containing longitude values of LINK site b
        lat_b (np.ndarray): Vector containing latitude values of LINK site b
        lengths (np.ndarray): Vector containing distances between sites in km
    
    Returns:
        inca_RR (list): List with INCA RR values along the link path for each transmission
    """
    
    # load inca data
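    # (dir_inca is a module-level variable set in the __main__ block below)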
    inca_data = load_inca_data(dir_inca)
    _log(f'Loaded INCA data from {dir_inca.name} with shape {inca_data.shape}')
    
    # generate times of day in 15min (window) intervals
    window = 15
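    # note: seconds serve as a stand-in for minutes - stripping the leading '0:' of str(timedelta) yields '00:00', '00:15', ..., '23:45'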
    inca_times = sorted([str(i * datetime.timedelta(seconds = window))[2:] for i in range(24*60//window)])
    
    # generate times of LINK data (last 4 chars of datetime vector)
    time_func = np.vectorize(lambda x: f'{x[-4:-2]}:{x[-2:]}')
    link_times = time_func(datetimes)
    
    # get INCA indices of LINK times
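    # --> BEGINTIME is already rounded to 15min windows, so every link time matches an entry in inca_times exactly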
    idx_times = np.searchsorted(inca_times, link_times)
    _log(f'Created search indices vector for each transmission for INCA RR tensor')
    
    
    # convert wgs 84 coordinates to lambert conic conformal projection
    xa, ya = wgs84_to_lambert(lon_a, lat_a)
    xb, yb = wgs84_to_lambert(lon_b, lat_b)
    
    # create lambert coordinate matrices for start and end points    
    start = np.array([xa, ya]).T
    end = np.array([xb, yb]).T
    
    # define n INCA RR points for each km of link length
    # --> min. points: 3 (start, mid and end) if length < 3km
    # --> get odd amount of RR points to always include the middle of the link
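    # --> e.g. a 7.4 km link is rounded to n = 7 sample points, a 4 km link to n = 5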
    n = np.round(lengths, decimals = 0).astype(int)
    n = np.where(n < 3, 3, n)
    n = np.where(n%2 == 0, n+1, n)
    
    # set list for INCA RR data
    inca_RR = []
    
    # get n INCA RR data points for each transmission
    # TODO: find vectorized solution without loop
    # --> n is dynamic (different lengths of links) and np.linspace() only supports n as a scalar
    for i in range(len(link_times)):        
        # create n points for each km of link
        # --> lambert projection is in meters, therefore linspace equally divides the link into n points (including start, end and mid)
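        # --> np.linspace(start[i], end[i], n[i], axis = 1) returns shape (2, n[i]), unpacking yields the x and y vectors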
        x, y = np.linspace(start[i], end[i], n[i], axis = 1)
        
        # get nearest inca coordinates for each point on link
        idx, idy = lambert_to_inca_idx(x, y)
        
        # get INCA RR data for each point on link
        inca_RR.append(inca_data[idx_times[i], idy, idx])
    
    return inca_RR


def prep() -> None:
    """Data preparation for LINK config and transmissions"""
    
    _log('\n******************************** READ FILES ********************************')
    
    # read config and transmission files
    df_config = pd.read_csv(file_config, sep = ';')
    _log(f'Read config file with shape {df_config.shape}')
    df_trans = pd.read_csv(file_trans, sep = ';')
    _log(f'Read transmissions file with shape {df_trans.shape}')
    
    
    _log('\n******************************** BASIC PREP ********************************')
    
    # remove test-link with link id 1
    df_config = df_config[df_config['LINKID'] != 1]
    df_trans = df_trans[df_trans['RADIOLINKID'] != 1]
    _log('Removed all entries of test-link with linkid 1')
    
    
    # drop links that are officially not in use ('na' in CAPACITYINTERFACE and/or FREQUENCY)
    # --> see Q&A Phillip Scheffknecht (05 Feb 2021)
    df_config = df_config.dropna(axis = 0, subset = ['CAPACITYINTERFACE', 'FREQUENCY'])
    _log('Dropped configs with NA in CAPACITYINTERFACE and/or FREQUENCY (links officially not in use)')
    
    
    # delete rows with unused link ids
    # get link ids of config and transmissions
    config_ids = set(df_config['LINKID'].unique().tolist())
    trans_ids = set(df_trans['RADIOLINKID'].unique().tolist())
    
    # delete link ids in transmissions without config
    unused_trans_ids = trans_ids - config_ids
    df_trans = df_trans[~df_trans['RADIOLINKID'].isin(list(unused_trans_ids))]
    _log('Removed all links in transmissions where no config is present')
    
    # delete link ids in config without transmissions
    unused_config_ids = config_ids - trans_ids
    df_config = df_config[~df_config['LINKID'].isin(list(unused_config_ids))]
    _log('Removed all links in config where no transmission is present')
    
    
    # delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
    # gather duplicated rows in config file
    col_subset = ['LINKTYPE', 'SITEID_A', 'LATITUDE_A', 'LONGITUDE_A', 'SITEID_B', 'LATITUDE_B', 'LONGITUDE_B', 'CAPACITYINTERFACE', 'FREQUENCY']
    duplicated_config_ids = set(df_config[df_config.duplicated(subset = col_subset)]['LINKID'].unique().tolist())
    
    # gather duplicated link ids of config file in transmissions file
    found_trans_ids = set(df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist())
    
    # calculate unused duplicated ids in config file
    duplicated_unused_ids = duplicated_config_ids - found_trans_ids
    
    # delete rows with unused duplicated link ids in config file
    df_config = df_config[~df_config['LINKID'].isin(list(duplicated_unused_ids))]
    _log('Removed duplicated links which are not in use')
    
    
    # check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B) pairs
    # gather unique combinations of link ids and site ids of config
    # (new temporary column is much faster than pandas agg function)
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_A']
    config_loc_tuples = set(df_config['TEMP_LOC_TUPLE'].unique().tolist())
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_B']
    config_loc_tuples.update(set(df_config['TEMP_LOC_TUPLE'].unique().tolist()))
    
    # gather unique combinations of link ids and locations in transmissions
    # (new temporary column is much faster than pandas agg function)
    df_trans['TEMP_LOC_TUPLE'] = df_trans['RADIOLINKID'].astype(str) + '--' + df_trans['LOCATION']
    trans_loc_tuples = set(df_trans['TEMP_LOC_TUPLE'].unique().tolist())
    
    # calculate link id - location tuples which are not in config
    invalid_loc_tuples = trans_loc_tuples - config_loc_tuples
    
    # remove invalid tuples from transmissions
    df_trans = df_trans[~df_trans['TEMP_LOC_TUPLE'].isin(list(invalid_loc_tuples))]
    _log('Removed all transmissions with invalid RADIOLINKID - LOCATION tuples (not present in config)')
    
    # remove temp columns
    df_config = df_config.drop(['TEMP_LOC_TUPLE'], axis = 1)
    df_trans = df_trans.drop(['TEMP_LOC_TUPLE'], axis = 1)
    
    # calculate midpoint of links
    df_config['LONGITUDE_MID'], df_config['LATITUDE_MID'] = get_midpoint(
        df_config['LONGITUDE_A'], df_config['LATITUDE_A'],
        df_config['LONGITUDE_B'], df_config['LATITUDE_B']
    )
    _log('Calculated midpoint of links')
    
    # calculate LENGTH in km between link sites
    df_config['LENGTH'] = get_distance(df_config['LONGITUDE_A'], df_config['LATITUDE_A'], df_config['LONGITUDE_B'], df_config['LATITUDE_B'])
    _log('Calculated distances between sites using a WGS84 ellipsoid')
    
    
    # convert CAPACITYINTERFACE to int
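    # --> the slice [:-5] strips the last 5 characters (presumably a unit suffix) before casting to int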
    df_config['CAPACITYINTERFACE'] = df_config['CAPACITYINTERFACE'].map(lambda x: str(x)[:-5]).astype('int')
    _log('Converted CAPACITYINTERFACE to int')
    
    # convert FREQUENCY to float
    df_config['FREQUENCY'] = df_config['FREQUENCY'].map(lambda x: str(x)[:-3]).astype('float')
    _log('Converted FREQUENCY to float')
    
    # convert RXFREQUENCY and TXFREQUENCY to float and from MHz to GHz
    # check if columns exist (only present with 2021-05)
    if 'TXFREQUENCY' in df_config.columns and 'RXFREQUENCY' in df_config.columns:
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY'].astype('float')
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY']/1000
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY'].astype('float')
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY']/1000
        _log('Converted RXFREQUENCY and TXFREQUENCY to float and GHz')
    
    
    # drop transmissions with STATUS and/or OPERATIONALSTATUS not equal to 1
    df_trans = df_trans[df_trans['STATUS'] == 1]
    df_trans = df_trans[df_trans['OPERATIONALSTATUS'] == 1]
    _log('Removed transmissions with STATUS and/or OPERATIONALSTATUS unequal 1')
    
    
    _log('\n******************************** BUILD LINK DF *****************************')
    
    # copy transmissions dataframe to link dataframe
    df_link = df_trans.copy()
    _log('Copy transmissions dataframe to link dataframe')
    
    
    # convert BEGINTIME to UTC
    df_link['BEGINTIME'] = pd.to_datetime(df_link['BEGINTIME'], format = '%Y-%m-%d %H:%M:%S')
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.tz_localize('Europe/Vienna').dt.tz_convert('UTC').dt.tz_localize(None)
    _log('Converted BEGINTIME to UTC')
    
    # fix 15min timelag in transmissions
    df_link['BEGINTIME'] = df_link['BEGINTIME'] - pd.Timedelta(15, unit = 'min')
    _log('Fixed 15min timelag in transmissions')
    
    # only use transmissions with begintime matching INCA date
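    # --> assumes the INCA directory stem is the date in YYYYMMDD format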
    date_inca = np.datetime64(f'{str(dir_inca.stem)[0:4]}-{str(dir_inca.stem)[-4:-2]}-{str(dir_inca.stem)[-2:]}')
    df_link = df_link[df_link['BEGINTIME'].dt.date == date_inca]
    _log('Filtered transmissions to match BEGINTIME with INCA date')
    
    # copy REMOTERXLEVEL to PMIN and PMAX (for aggregation in 15min window conversion)
    df_link['PMIN'] = df_link['REMOTERXLEVEL']
    df_link['PMAX'] = df_link['REMOTERXLEVEL']
    _log('Created PMIN and PMAX of REMOTERXLEVEL')
    # convert 3min windows to 15min windows
    group_cols = [df_link['BEGINTIME'].dt.round('15Min'), 'RADIOLINKID']
    agg_cols = {
        'PMIN' : 'min',
        'PMAX' : 'max',
        'REMOTERXLEVEL' : 'mean',
        'TXLEVEL' : 'mean',
        'SPEED' : 'mean',
        'CURRRXBITRATE' : 'mean',
        'CURRTXBITRATE' : 'mean'
    }
    df_link = df_link.groupby(group_cols).agg(agg_cols).reset_index()
    _log('Converted 3min windows to 15min windows')
    
    # convert BEGINTIME to RAINLINK format
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.strftime('%Y%m%d%H%M')
    _log('Converted BEGINTIME to RAINLINK format "%Y%m%d%H%M"')
    
    
    # merge config and link dataframe
    drop_cols = ['RADIOLINKID', 'LINKTYPE', 'SITEID_A', 'SITEID_B', 'CAPACITYINTERFACE']
    df_link = pd.merge(df_link, df_config, how = 'inner', left_on = 'RADIOLINKID', right_on = 'LINKID').drop(drop_cols, axis = 1)
    _log('Merged config data to link dataframe')
    
    
    _log('\n******************************** CALC POWER LEVELS *************************')
    
    # build df with differences of sending and receiving levels
    df_diff = df_link[['LINKID', 'TXLEVEL', 'REMOTERXLEVEL']].copy()
    df_diff['PINSTMEAN'] = df_diff['TXLEVEL'] - df_diff['REMOTERXLEVEL']
    _log('Built dataframe with mean link difference levels of TXLEVEL and REMOTERXLEVEL')
    
    # get mean of differences per link
    df_diff = df_diff.groupby(['LINKID']).agg({'PINSTMEAN' : 'mean'}).reset_index()
    # merge differences to transmission dataframe
    df_link = pd.merge(df_link, df_diff, how = 'left', left_on = 'LINKID', right_on = 'LINKID')
    _log('Merged mean link difference levels back to link dataframe')
    
    
    # calculate instantaneous values from transmitted and received signal strength
    df_link['PINST'] = df_link['TXLEVEL'] - df_link['REMOTERXLEVEL']
    _log('Calculated instantaneous values PINST from transmitted and received signal strength')
    
    # calculate differences from instantaneous values and mean link diff. levels
    df_link['PDIFF'] = df_link['PINST'] - df_link['PINSTMEAN']
    _log('Calculated difference levels PDIFF as deviation of instantaneous values from daily link means')
    
    
    # calculate attenuation of transmitted and received signal strengths per transmission
    df_link['ATTENUATION'] = df_link['REMOTERXLEVEL'] - df_link['TXLEVEL']
    _log('Calculated attenuation per transmission')
    
    # calculate extinction coefficients for links from attenuation and link length based on the beer-lambert law
    # Coeff = -ln(10) * (Attenuation/PathLength)
    df_link['EXTINCTIONCOEFF'] = -np.log(10) * (df_link['ATTENUATION'] / df_link['LENGTH'])
    _log('Calculated extinction coefficients based on beer-lambert law')
    
    
    # rename and reorder columns to match the RAINLINK format
    name_cols = {
        'LINKID' : 'ID',
        'BEGINTIME' : 'DateTime',
        'PMIN' : 'Pmin',
        'PMAX' : 'Pmax',
        'REMOTERXLEVEL' : 'Pmean',
        'PINST' : 'Pinst',
        'PINSTMEAN' : 'PinstMean',
        'PDIFF' : 'Pdiff',
        'TXLEVEL' : 'TxLevel',
        'ATTENUATION' : 'Attenuation',
        'EXTINCTIONCOEFF' : 'ExtinctionCoeff',
        'SPEED' : 'Speed',
        'CURRRXBITRATE' : 'RxBitrate',
        'CURRTXBITRATE' : 'TxBitrate',
        'LONGITUDE_A' : 'XStart',
        'LATITUDE_A' : 'YStart',
        'LONGITUDE_MID' : 'XMid',
        'LATITUDE_MID' : 'YMid',
        'LONGITUDE_B' : 'XEnd',
        'LATITUDE_B' : 'YEnd',
        'LENGTH' : 'PathLength',
        'FREQUENCY' : 'Frequency',
    }
    
    # check if RXFREQUENCY and TXFREQUENCY exist (only present with 2021-05)
    if 'TXFREQUENCY' in df_link.columns and 'RXFREQUENCY' in df_link.columns:
        name_cols.update({
            'RXFREQUENCY' : 'RxFrequency',
            'TXFREQUENCY' : 'TxFrequency'
        })
    
    df_link = df_link.rename(columns = name_cols).reindex(columns = list(name_cols.values()))
    _log('Converted link dataframe to RAINLINK format')
    
    
    _log('\n******************************** MERGE INCA ********************************')
    
    # set INCA RR data for each km of link path
    df_link['RRpath'] = get_inca_data(
        df_link['DateTime'],
        df_link['XStart'],
        df_link['YStart'],
        df_link['XEnd'],
        df_link['YEnd'],
        df_link['PathLength']
    )
    _log('Merged INCA RR data to transmissions')
    
    
    _log('\n******************************** SAVE FILES ********************************')
    
    # build path for clean config and transmissions destination files
    dest_config = file_config.with_name(f'{file_config.stem}_clean{file_config.suffix}')
    dest_trans = file_trans.with_name(f'{file_trans.stem}_clean{file_trans.suffix}')
    
    # build path for clean link destination file (same folder, date and extension as transmissions file)
    date = str(file_trans.stem).split('_')[-1]
    dest_link = pathlib.Path(dest_trans.parents[0], f'LINK_{date}_clean{file_trans.suffix}')
    
    
    # save clean files
    df_config.to_csv(dest_config, sep = ';', header = True, index = False)
    _log(f'Saved clean config file with shape {df_config.shape} to "{str(dest_config)}"')
    df_trans.to_csv(dest_trans, sep = ';', header = True, index = False)
    _log(f'Saved clean transmissions file with shape {df_trans.shape} to "{str(dest_trans)}"')
    df_link.to_csv(dest_link, sep = ';', header = True, index = False)
    _log(f'Saved clean link file with shape {df_link.shape} to "{str(dest_link)}"')


if __name__ == '__main__':
    # flag for data prep
    start_prep = True
    
    # get config and transmissions file from arguments
    args = parse_arguments()
    
    # convert config, transmissions and inca arguments to paths
    file_config = pathlib.Path(args.config)
    file_trans = pathlib.Path(args.transmissions)
    dir_inca = pathlib.Path(args.inca)
    
    # check if config file exists
    if not file_config.exists():
        _log('Invalid path for config file!')
        start_prep = False
        
    # check if transmissions file exists
    if not file_trans.exists():
        _log('Invalid path for transmissions file!')
        start_prep = False
    
    # check if inca dir exists
    if not dir_inca.exists():
        _log('Invalid path for inca directory!')
        start_prep = False
    
    # start prep if flag is True, otherwise exit with code 2
    if start_prep:
        prep()
    else:
        sys.exit(2)