prep.py 24.1 KB
Newer Older
1
2
3
4
#
# Title: Data Preparation for LINK Configs and Transmissions
# Author: Fabian Kovac <ds191008@fhstp.ac.at>
# Team: University of Applied Sciences St. Pölten
5
6
# Version: 2.6
# Last changed: 2021-09-22
7
8
9
#

import sys
10
import gzip
11
12
import pathlib
import argparse
13
import datetime
14
15
16
17
18
19

import numpy as np
import pandas as pd


def parse_arguments() -> argparse.Namespace:
20
    """Parses provided commandline arguments for LINK
21
22
	
	Returns:
23
		args (argparse.Namespace): object with paths to provided files
24
25
26
	"""
    
    # create argument parser with description
27
    desc = '# Data Preparation for LINK\n'
28
29
30
31
32
33
    desc += '-'*64 + '\n'
    desc += 'Script outputs the same files with a "_clean" suffix.\n'
    desc += 'Existing clean versions are automatically overwritten!'
    
    parser = argparse.ArgumentParser(
        prog = 'prep.py',
34
        usage = 'python %(prog)s -c <config_file> -t <transmissions_file> -i <inca_dir>',
35
36
37
38
39
40
41
42
43
        description = desc,
        formatter_class = argparse.RawTextHelpFormatter
    )
    
    # add required argument group
    # add config parameter to parser
    # add transmissions parameter to parser
    required_args = parser.add_argument_group('required arguments')
    required_args.add_argument('-c', '--config', type = str, required = True, help = 'Path to Config-File')
Fabian Kovac's avatar
Fabian Kovac committed
44
    required_args.add_argument('-t', '--transmissions', type = str, required = True, help = 'Path to Transmissions-File')
45
    required_args.add_argument('-i', '--inca', type = str, required = True, help = 'Path to Inca-Dir')
46
47
48
49
50
51
52
53
    
    # parse arguments
    args = parser.parse_args()
    
    return args


def _log(msg: str) -> None:
54
    """Logs message
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
    
    Parameters:
        msg (str): Message to log to console
	"""
    
    # add marker to log message
    marker = '%'
    if msg[:1] == '\n':
        msg = f'\n{marker} {msg[1:]}'
    else:
        msg = f'{marker} {msg}'
    
    # print message
    print(msg)


71
def load_inca_file(file_inca: pathlib.Path) -> np.array:
72
    """Loads gzipped INCA data of a given file
Fabian Kovac's avatar
Fabian Kovac committed
73
    
74
75
    Parameters:
       file_inca (pathlib.Path): Path to INCA file
Fabian Kovac's avatar
Fabian Kovac committed
76
    
77
78
79
80
81
82
83
    Returns:
        x (np.array): Matrix with shape (401, 701)
    """
    
    # open zipped file and bring data to right shape (resolution x,y: 701x401 km2)
    with gzip.open(file_inca, 'rb') as file:
        x = file.read()
84
        x = np.fromstring(x, dtype = np.float32, sep = ' ')
85
86
87
88
89
90
        x = np.reshape(x, (401, 701))

        return x


def load_inca_data(dir_inca: pathlib.Path) -> np.array:
91
    """Loads inca files of a given inca dir
Fabian Kovac's avatar
Fabian Kovac committed
92
    
93
94
    Parameters:
        dir_inca (pathlib.Path): Directory for INCA files
Fabian Kovac's avatar
Fabian Kovac committed
95
    
96
97
98
99
100
101
    Returns:
        inca_Data (np.array): Tensor with shape (96, 401, 701)
    """
    
    # initialize tensor with 96 15min intervals
    inca_data = np.zeros((96, 401, 701))
Fabian Kovac's avatar
Fabian Kovac committed
102
    
103
104
105
106
107
108
109
110
111
112
113
    # load inca dates from inca dir
    for i, file_inca in enumerate(sorted([file for file in dir_inca.iterdir() if file.is_file()])):        
        # load zipped ascii data
        data = load_inca_file(file_inca)
        
        # update inca tensor
        inca_data[i] = data
    
    return inca_data


114
def get_midpoint(lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array) -> np.array:
Fabian Kovac's avatar
Fabian Kovac committed
115
    """Calculcates the midpoint between two wgs 84 coordinate pairs
Fabian Kovac's avatar
Fabian Kovac committed
116
    
117
118
119
120
121
    Parameters:
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B
Fabian Kovac's avatar
Fabian Kovac committed
122
    
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
    Returns:
        lon, lat (np.array): Vectors with longitudes and latitudes of the midpoints
    """
    
    # convert degrees to radians
    lon_a, lat_a = np.radians(lon_a), np.radians(lat_a)
    lon_b, lat_b = np.radians(lon_b), np.radians(lat_b)
    
    # calculate midpoint
    Bx = np.cos(lat_b) * np.cos(lon_b - lon_a)
    By = np.cos(lat_b) * np.sin(lon_b - lon_a)
    
    lon_mid = lon_a + np.arctan2(By, np.cos(lat_a) + Bx)
    lat_mid = np.arctan2(
        np.sin(lat_a) + np.sin(lat_b),
        np.sqrt(np.square(np.cos(lat_a) + Bx) + np.square(By))
    )
    
    return np.degrees(lon_mid), np.degrees(lat_mid)


144
def get_distance(lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array) -> np.array:
Fabian Kovac's avatar
Fabian Kovac committed
145
146
    """Calculcates distance between two coordinates in km using a WGS84 rotation-ellipsoid
    
147
    Parameters:
148
149
150
151
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B
Fabian Kovac's avatar
Fabian Kovac committed
152
    
153
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
154
        length (np.array): Vector with distances in km
155
    """
Fabian Kovac's avatar
Fabian Kovac committed
156
157
    
    # constants (equator radius and pole radius in km)
158
159
160
    # r_equator is the 'semi-major axis' and r_pole the 'semi-minor axis' on a WGS84 ellipsoid
    r_equator = 6378.137
    r_pole = 6356.7523142
Fabian Kovac's avatar
Fabian Kovac committed
161
162
163
    
    # calculate points on rotation-ellipsoid
    # point A
164
165
166
167
    za = np.sin(np.radians(lat_a)) * r_pole
    ra = np.cos(np.radians(lat_a)) * r_equator
    xa = np.sin(np.radians(lon_a)) * ra
    ya = np.cos(np.radians(lon_a)) * ra
Fabian Kovac's avatar
Fabian Kovac committed
168
169
    
    # point B
170
171
172
173
    zb = np.sin(np.radians(lat_b)) * r_pole
    rb = np.cos(np.radians(lat_b)) * r_equator
    xb = np.sin(np.radians(lon_b)) * rb
    yb = np.cos(np.radians(lon_b)) * rb
Fabian Kovac's avatar
Fabian Kovac committed
174
175
    
    # calculate differences between A and B
176
177
178
    dx = xa - xb
    dy = ya - yb
    dz = za - zb
Fabian Kovac's avatar
Fabian Kovac committed
179
180
181
182
183
    
    # calculate distances between points using the square root of the sum of squares
    distance = np.sqrt(np.square(dx) + np.square(dy) + np.square(dz))
    	
    return distance
184
185


Fabian Kovac's avatar
Fabian Kovac committed
186
def wgs84_to_lambert(lon: np.array, lat: np.array) -> tuple:
Fabian Kovac's avatar
Fabian Kovac committed
187
188
    """Convert UTM coordinates to an Austrian Lambert Conic Conformal Projection (lcc)
    EPSG code: 31287, see https://epsg.io/31287
189
190
    
    Parameters:
Fabian Kovac's avatar
Fabian Kovac committed
191
192
        lon (np.array): Vector containing wgs 84 longitudes
        lat (np.array): Vector containing wgs 84 latitudes
193
194
    
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
195
        x, y (np.array): Vectors containing lcc coordinates
196
197
	"""
    
Fabian Kovac's avatar
Fabian Kovac committed
198
    # convert wgs 84 coordinates to radians
199
200
    lon = np.radians(lon)
    lat = np.radians(lat)
Fabian Kovac's avatar
Fabian Kovac committed
201

202
203
204
205
206
207
208
209
210
211
    # define standard parallels according to EPSG:31287 - MGI/Austria Lambert
    lat1 = np.radians(49)
    lat2 = np.radians(46)
    
    # point of reference:
    #   lon: 13°20'E
    #   lat: 47°30'N
    lon0 = np.radians(13.33333333333333)
    lat0 = np.radians(47.5)
    
Fabian Kovac's avatar
Fabian Kovac committed
212
    # false easting/northing
213
    # INCA grid: 701x401 km
Fabian Kovac's avatar
Fabian Kovac committed
214
    # compensate for point of reference (middle of the grid)
Fabian Kovac's avatar
Fabian Kovac committed
215
216
217
    # move point of reference (0, 0) to following x and y coordinates (x0, y0) to e.g. get strictly positive coordinates
    # false easting: 3505000m
    # false northing: 200500m
218
219
220
221
222
223
    x0 = 350500
    y0 = 200500
    
    # volumetric mean radius of the earth in m
    R = 6371000

Fabian Kovac's avatar
Fabian Kovac committed
224
225
226
227
    # lambert conformal conic projection as in https://pubs.usgs.gov/pp/1395/report.pdf
    # reference:
    # >>    J. P. Snyder, Map projections--a working manual. Washington: U.S. G.P.O.,
    # >>    For sale by the Supt. of Docs., 1987., pp. 104-110
Fabian Kovac's avatar
Fabian Kovac committed
228
    n = np.log(np.cos(lat1) * (1 / np.cos(lat2))) / np.log(np.tan(np.pi/4 + lat2/2) * (1 / np.tan(np.pi/4 + lat1/2)))
229
    F = (np.cos(lat1) * np.tan(np.pi/4 + lat1/2)**n) / n
Fabian Kovac's avatar
Fabian Kovac committed
230
231
    p = R * F * (1 / np.tan(np.pi/4 + lat/2))**n
    p0 = R * F * (1 / np.tan(np.pi/4 + lat0/2))**n
232

Fabian Kovac's avatar
Fabian Kovac committed
233
    # calculate x and y in lcc
234
235
236
237
238
239
    x = p * np.sin(n * (lon - lon0)) + x0
    y = p0 - p * np.cos(n * (lon - lon0)) + y0

    return x, y


240
def lambert_to_inca_idx(x: np.array, y: np.array) -> tuple:
241
    """Convert x and y of Lambert Conic Conformal Projection to an INCA index
242
243
    
    Parameters:
Fabian Kovac's avatar
Fabian Kovac committed
244
245
        x (np.array): Vector containing x values in meter of Lambert Conformal Conic Projection
        y (np.array): Vector containing y values in meter of in Lambert Conformal Conic Projection
246
247
    
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
248
        ix, iy (np.array): indices for INCA data
249
250
	"""
    
Fabian Kovac's avatar
Fabian Kovac committed
251
252
253
254
    # convert from m to km
    x /= 1000
    y /= 1000
    
255
256
257
    # convert coordinates to INCA index (nearest coordinates as int)
    ix = np.round(x, decimals = 0).astype(int)
    iy = np.round(y, decimals = 0).astype(int)
Fabian Kovac's avatar
Fabian Kovac committed
258
259
    
    return ix, iy
260
261


262
263
def get_inca_data(datetimes: np.array, lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array, lengths: np.array) -> np.array:
    """Get INCA RR data based for each km of link path
264
265
266
    
    Parameters:
        datetimes (np.array): Vector containing datetimes of transmissions
267
268
269
270
271
        lon_a (np.array): Vector containing longitude values of LINK site a
        lat_a (np.array): Vector containing latitude values of LINK site a
        lon_b (np.array): Vector containing longitude values of LINK site b
        lat_b (np.array): Vector containing latiotude values of LINK site b
        length (np.array): Vector containing distance between sites in km
272
273
    
    Returns:
274
        inca_RR (np.array): Vector containing INCA RR data for each transmission
275
276
	"""
    
277
278
279
    # load inca data
    inca_data = load_inca_data(dir_inca)
    _log(f'Loaded INCA data from {str(dir_inca).split("/")[-1]} with shape {inca_data.shape}')
280
281
282
283
284
    
    # generate times of day in 15min (window) intervals
    window = 15
    inca_times = sorted([str(i * datetime.timedelta(seconds = window))[2:] for i in range(24*60//window)])
    
Fabian Kovac's avatar
Fabian Kovac committed
285
286
287
    # generate times of LINK data (last 4 chars of datetime vector)
    time_func = np.vectorize(lambda x: f'{x[-4:-2]}:{x[-2:]}')
    link_times = time_func(datetimes)
288
    
289
    # get INCA indices of LINK times
290
    idx_times = np.searchsorted(inca_times, link_times)
291
    _log(f'Created search indices vector for each transmission for INCA RR tensor')
292
    
293
    
Fabian Kovac's avatar
Fabian Kovac committed
294
295
296
    # convert wgs 84 coordinates to lambert conic conformal projection
    xa, ya = wgs84_to_lambert(lon_a, lat_a)
    xb, yb = wgs84_to_lambert(lon_b, lat_b)
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
    
    # create lambert coordinate matrices for start and end points    
    start = np.array([xa, ya]).T
    end = np.array([xb, yb]).T
    
    # define n INCA RR points for each km of link length
    # --> min. points: 3 (start, mid and end) if length < 3km
    n = np.round(lengths, decimals = 0).astype(int)
    n = np.where(n < 3, 3, n)
    
    # set list for INCA RR data
    inca_RR = []
    
    # get n INCA RR data points for each transmission
    # TODO: find vectorized solution without loop
    # --> n is dynamic (diffent lengths of links) and np.linspace() only supports n as a scalar
    for i in range(len(link_times)):        
        # create n points for each km of link
        # --> lambert projection is in meters, therefore linspace equally divides the link into n points (including start, end and mid)
        x, y = np.linspace(start[i], end[i], n[i], axis = 1)
        
        # get nearest inca coordinates for each point on link
        idx, idy = lambert_to_inca_idx(x, y)
        
        # get INCA RR data for each point on link
        inca_RR.append(inca_data[idx_times[i], idy, idx])
323
324
    
    return inca_RR
325
326
327


def prep() -> None:
Fabian Kovac's avatar
Fabian Kovac committed
328
    """Data preparation for LINK config and transmissions"""
329
330
331
    
    _log('\n******************************** READ FILES ********************************')
    
332
    # read config and transmission files
333
334
335
336
    df_config = pd.read_csv(file_config, sep = ';')
    _log(f'Read config file with shape {df_config.shape}')
    df_trans = pd.read_csv(file_trans, sep = ';')
    _log(f'Read transmissions file with shape {df_trans.shape}')
337
338
    
    
339
340
341
342
343
344
    _log('\n******************************** BASIC PREP ********************************')
    
    # remove test-link with link id 1
    df_config = df_config[df_config['LINKID'] != 1]
    df_trans = df_trans[df_trans['RADIOLINKID'] != 1]
    _log('Removed all entries of test-link with linkid 1')
345
346
    
    
347
348
349
350
    # drop links that are officially not in use ('na' in CAPACITYINTERFACE and/or FREQUENCY)
    # --> see Q&A Phillip Scheffknecht (05 Feb 2021)
    df_config = df_config.dropna(axis = 0, subset = ['CAPACITYINTERFACE', 'FREQUENCY'])
    _log('Dropped configs with NA in CAPACITYINTERFACE and/or FREQUENCY (links officially not in use)')
351
352
    
    
353
354
    # delete rows with unused link ids
    # get link ids of config and transmissions
355
356
    config_ids = set(df_config['LINKID'].unique().tolist())
    trans_ids = set(df_trans['RADIOLINKID'].unique().tolist())
357
    
358
    # delete link ids in transmissions without config
359
    unused_trans_ids = trans_ids - config_ids
360
361
    df_trans = df_trans[~df_trans['RADIOLINKID'].isin(list(unused_trans_ids))]
    _log('Removed all links in transmissions where no config is present')
362
    
363
    # delete link ids in config without transmissions
364
    unused_config_ids = config_ids - trans_ids
365
366
    df_config = df_config[~df_config['LINKID'].isin(list(unused_config_ids))]
    _log('Removed all links in config where no transmission is present')
367
368
    
    
369
    # delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
370
371
    # gather duplicated rows in config file
    col_subset = ['LINKTYPE', 'SITEID_A', 'LATITUDE_A', 'LONGITUDE_A', 'SITEID_B', 'LATITUDE_B', 'LONGITUDE_B', 'CAPACITYINTERFACE', 'FREQUENCY']
372
    duplicated_config_ids = set(df_config[df_config.duplicated(subset = col_subset)]['LINKID'].unique().tolist())
373
    
374
    # gather duplicated link ids of config file in transmissions file
375
    found_trans_ids = set(df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist())
376
    
377
    # calculate unused duplicated ids in config file
378
    duplicated_used_ids = duplicated_config_ids - found_trans_ids
379
    
380
    # delete rows with unused duplicated link ids in config file
381
    df_config = df_config[~df_config['LINKID'].isin(list(duplicated_used_ids))]
382
383
    _log('Removed duplicated links which are not in use')
    
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
    
    # check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B) pairs
    # gather unique combinations of link ids and site ids of config
    # (new temporary column is much faster then pandas agg function)
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_A']
    config_loc_tuples = set(df_config['TEMP_LOC_TUPLE'].unique().tolist())
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_B']
    config_loc_tuples.update(set(df_config['TEMP_LOC_TUPLE'].unique().tolist()))
    
    # gather unique combinations of link ids and locations in transmissions
    # (new temporary column is much faster then pandas agg function)
    df_trans['TEMP_LOC_TUPLE'] = df_trans['RADIOLINKID'].astype(str) + '--' + df_trans['LOCATION']
    trans_loc_tuples = set(df_trans['TEMP_LOC_TUPLE'].unique().tolist())
    
    # calculate link id - location tuples which are not in config
    invalid_loc_tuples = trans_loc_tuples - config_loc_tuples
    
    # remove invalid tuples from transmissions
    df_trans = df_trans[~df_trans['TEMP_LOC_TUPLE'].isin(list(invalid_loc_tuples))]
    _log('Removed all transmissions with invalid RADIOLINKID - LOCATION tuples (not present in config)')
    
    # remove temp columns
    df_config = df_config.drop(['TEMP_LOC_TUPLE'], axis = 1)
    df_trans = df_trans.drop(['TEMP_LOC_TUPLE'], axis = 1)
    
409
    
410
    # calculate midpoint of links
Fabian Kovac's avatar
Fabian Kovac committed
411
412
413
414
    df_config['LONGITUDE_MID'], df_config['LATITUDE_MID'] = get_midpoint(
        df_config['LONGITUDE_A'], df_config['LATITUDE_A'],
        df_config['LONGITUDE_B'], df_config['LATITUDE_B']
    )
415
416
    _log('Calculated midpoint of links')
    
417
    # calculate LENGTH in km between links
418
    df_config['LENGTH'] = get_distance(df_config['LONGITUDE_A'], df_config['LATITUDE_A'], df_config['LONGITUDE_B'], df_config['LATITUDE_B'])
419
    _log('Calculated distances between sites using a WGS84 ellipsoid')
420
421
    
    
Fabian Kovac's avatar
Fabian Kovac committed
422
423
424
425
    # convert CAPACITYINTERFACE to int
    df_config['CAPACITYINTERFACE'] = df_config['CAPACITYINTERFACE'].map(lambda x: str(x)[:-5]).astype('int')
    _log('Converted CAPACITYINTERFACE to int')
    
426
427
428
429
    # convert FREQUENCY to float
    df_config['FREQUENCY'] = df_config['FREQUENCY'].map(lambda x: str(x)[:-3]).astype('float')
    _log('Converted FREQUENCY to float')
    
430
431
    # convert RXFREQUENCY and TXFREQUENCY to float and from MHz to GHz
    # check if columns exists (only present with 2021-05)
432
433
434
    if 'TXFREQUENCY' in df_config.columns and 'RXFREQUENCY' in df_config.columns:
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY'].astype('float')
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY']/1000
435
436
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY'].astype('float')
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY']/1000
437
        _log('Converted RXFREQUENCY and TXFREQUENCY to float and GHz')
438
    
439
440
    
    # drop transmissions with (operational) status unequal 1
441
442
443
444
445
446
447
448
449
450
    df_trans = df_trans[df_trans['STATUS'] == 1]
    df_trans = df_trans[df_trans['OPERATIONALSTATUS'] == 1]
    _log('Removed transmissions with STATUS and/or OPERATIONALSTATUS unequal 1')
    
    
    _log('\n******************************** BUILD LINK DF *****************************')
    
    # copy transmissions dataframe to link dataframe
    df_link = df_trans.copy()
    _log('Copy transmissions dataframe to link dataframe')
451
452
    
    
453
    # convert begintime to utc
454
455
456
    df_link['BEGINTIME'] = pd.to_datetime(df_link['BEGINTIME'], format = '%Y-%m-%d %H:%M:%S')
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.tz_localize('Europe/Vienna').dt.tz_convert('UTC').dt.tz_localize(None)
    _log('Converted BEGINTIME to UTC')
457
    
Fabian Kovac's avatar
Fabian Kovac committed
458
    # fix 15min timelag in transmissions
459
460
461
    df_link['BEGINTIME'] = df_link['BEGINTIME'] - pd.Timedelta(15, unit = 'min')
    _log('Fixed 15min timelag in transmissions')
    
Fabian Kovac's avatar
Fabian Kovac committed
462
463
464
465
466
    # only use transmissions with begintime matching INCA date
    date_inca = np.datetime64(f'{str(dir_inca.stem)[0:4]}-{str(dir_inca.stem)[-4:-2]}-{str(dir_inca.stem)[-2:]}')
    df_link = df_link[df_link['BEGINTIME'].dt.date == date_inca]
    _log('Filtered transmissions to match BEGINTIME with INCA date')
    
467
    
468
469
470
471
    # copy REMOTERXLEVEL to PMIN and PMAX (for aggregation in 15min window conversion)
    df_link['PMIN'] = df_link['REMOTERXLEVEL']
    df_link['PMAX'] = df_link['REMOTERXLEVEL']
    _log('Created PMIN and PMAX of REMOTERXLEVEL')
472
    
473
    # convert 3min windows to 15min windows
Fabian Kovac's avatar
Fabian Kovac committed
474
    group_cols = [df_link['BEGINTIME'].dt.round('15Min'), 'RADIOLINKID']
475
476
477
    agg_cols = {
        'PMIN' : 'min',
        'PMAX' : 'max',
478
479
        'REMOTERXLEVEL' : 'mean',
        'TXLEVEL' : 'mean',
480
        'SPEED' : 'mean',
481
482
        'CURRRXBITRATE' : 'mean',
        'CURRTXBITRATE' : 'mean'
483
    }
484
485
486
487
488
489
    df_link = df_link.groupby(group_cols).agg(agg_cols).reset_index()
    _log('Converted 3min windows to 15min windows')
    
    # convert BEGINTIME to RAINLINK format
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.strftime('%Y%m%d%H%M')
    _log('Converted BEGINTIME to RAINLINK format "%Y%m%d%H%M"')
490
491
492
493
494
495
496
497
498
    
    
    # merge config and link dataframe
    drop_cols = ['RADIOLINKID', 'LINKTYPE', 'SITEID_A', 'SITEID_B', 'CAPACITYINTERFACE']
    df_link = pd.merge(df_link, df_config, how = 'inner', left_on = 'RADIOLINKID', right_on = 'LINKID').drop(drop_cols, axis = 1)
    _log('Merged config data to link dataframe')
    
    
    _log('\n******************************** CALC POWER LEVELS *************************')
499
    
500
    # build df with differences of sending and receiving levels
501
    df_diff = df_link[['LINKID', 'TXLEVEL', 'REMOTERXLEVEL']].copy()
502
    df_diff['PINSTMEAN'] = df_diff['TXLEVEL'] - df_diff['REMOTERXLEVEL']
503
504
    _log('Built dataframe with mean link difference levels of TXLEVEL and REMOTERXLEVEL')
    
505
506
    # get mean of differences per link
    df_diff = df_diff.groupby(['LINKID']).agg({'PINSTMEAN' : 'mean'}).reset_index()
507
    
508
    
509
    # merge differences to transmission dataframe
510
511
512
    df_link = pd.merge(df_link, df_diff, how = 'left', left_on = 'LINKID', right_on = 'LINKID')
    _log('Merged mean link difference levels back to link dataframe')
    
513
514
515
516
517
518
519
    
    # calculate instantaneous values from transmitted and received signal strength
    df_link['PINST'] = df_link['TXLEVEL'] - df_link['REMOTERXLEVEL']
    _log('Calculated instantaneous values PINST from transmitted and received signal strength')
    
    # calculate differences from instantaneous values and mean link diff. levels
    df_link['PDIFF'] = df_link['PINST'] - df_link['PINSTMEAN']
520
    _log('Calculated difference levels PDIFF instantaneous values derivating of daily means of dry periods')
521
522
    
    
523
524
525
526
527
    # calculate attenuation of transmitted and received signal strengths per transmission
    df_link['ATTENUATION'] = df_link['REMOTERXLEVEL'] - df_link['TXLEVEL']
    _log('Calculated attenuation per transmission')
    
    # calculate extinction coefficients for links based on attenuation and lengths of the links based on beer-lambert law
528
529
    # Coeff = -ln(10) * (Attenuation/PathLength)
    df_link['EXTINCTIONCOEFF'] = -np.log(10) * (df_link['ATTENUATION'] / df_link['LENGTH'])
530
531
532
    _log('Calculated extinction coefficients based on beer-lambert law')
    
    
533
534
    # rename and reorder columns to aid RAINLINK format
    name_cols = {
535
        'LINKID' : 'ID',
536
537
538
        'BEGINTIME' : 'DateTime',
        'PMIN' : 'Pmin',
        'PMAX' : 'Pmax',
539
        'REMOTERXLEVEL' : 'Pmean',
540
541
542
        'PINST' : 'Pinst',
        'PINSTMEAN' : 'PinstMean',
        'PDIFF' : 'Pdiff',
543
        'TXLEVEL' : 'TxLevel',
544
545
        'ATTENUATION' : 'Attenuation',
        'EXTINCTIONCOEFFICIENT' : 'ExtinctionCoeff',
546
547
        'SPEED' : 'Speed',
        'CURRRXBITRATE' : 'RxBitrate',
548
        'CURRTXBITRATE' : 'TxBitrate',
549
550
        'LONGITUDE_A' : 'XStart',
        'LATITUDE_A' : 'YStart',
551
552
        'LONGITUDE_MID' : 'XMid',
        'LATITUDE_MID' : 'YMid',
553
554
        'LONGITUDE_B' : 'XEnd',
        'LATITUDE_B' : 'YEnd',
555
556
        'LENGTH' : 'PathLength',
        'FREQUENCY' : 'Frequency',
557
    }
558
559
    
    # check if RXFREQUENCY and TXFREQUENCY exists (only present with 2021-05)
560
    if 'TXFREQUENCY' in df_link.columns and 'RXFREQUENCY' in df_link.columns:
561
        name_cols.update({
562
563
            'RXFREQUENCY' : 'RxFrequency',
            'TXFREQUENCY' : 'TxFrequency'
564
565
        })
    
566
567
568
569
    df_link = df_link.rename(columns = name_cols).reindex(columns = list(name_cols.values()))
    _log('Converted link dataframe to RAINLINK format')
    
    
570
571
572
573
574
575
576
577
578
579
580
581
582
583
    _log('\n******************************** MERGE INCA ********************************')
    
    # set INCA RR data for each km of link path
    df_link['RRpath'] = get_inca_data(
        df_link['DateTime'],
        df_link['XStart'],
        df_link['YStart'],
        df_link['XEnd'],
        df_link['YEnd'],
        df_link['PathLength']
     )
    _log('Merged INCA RR data to transmissions')
    
    
584
585
586
587
588
589
590
    _log('\n******************************** SAVE FILES ********************************')
    
    # build path for clean config and transmissions destination files
    dest_config = file_config.with_name(f'{file_config.stem}_clean{file_config.suffix}')
    dest_trans = file_trans.with_name(f'{file_trans.stem}_clean{file_trans.suffix}')
    
    # build path for clean link destination file (same folder, date and extension as transmissions file)
591
    date = str(file_trans.stem).split('_')[-1]
592
    dest_link = pathlib.Path(dest_trans.parents[0], f'LINK_{date}_clean{file_trans.suffix}')
593
594
595
    
    
    # save clean files
596
597
598
599
600
601
602
603
604
    df_config.to_csv(dest_config, sep = ';', header = True, index = False)
    _log(f'Saved clean config file with shape {df_config.shape} to "{str(dest_config)}"')
    df_trans.to_csv(dest_trans, sep = ';', header = True, index = False)
    _log(f'Saved clean transmissions file with shape {df_trans.shape} to "{str(dest_trans)}"')
    df_link.to_csv(dest_link, sep = ';', header = True, index = False)
    _log(f'Saved clean link file with shape {df_link.shape} to "{str(dest_link)}"')


if __name__ == '__main__':
605
606
607
    # flag for data prep
    start_prep = True
    
608
609
610
    # get config and transmissions file from arguments
    args = parse_arguments()
    
611
612
613
    # convert config and transmissions arguments to paths
    file_config = pathlib.Path(args.config)
    file_trans = pathlib.Path(args.transmissions)
614
    dir_inca = pathlib.Path(args.inca)
615
616
617
618
619
620
621
622
    
    # check if config files exists
    if not file_config.exists():
        _log('Invalid path for config file!')
        start_prep = False
        
    # check if transmissions file exists
    if not file_trans.exists():
623
624
625
626
627
628
        _log('Invalid path for transmissions file!')
        start_prep = False
    
    # chec if inca dir exists
    if not dir_inca.exists():
        _log('Invalid path for inca directory!')
629
630
631
632
        start_prep = False
    
    # start prep if flag is True, otherwise exit with code 2
    if start_prep:
633
        prep()
634
    else:
635
        sys.exit(2)