#
# Title: Data Preparation for LINK Configs and Transmissions
# Author: Fabian Kovac <ds191008@fhstp.ac.at>
# Team: University of Applied Sciences St. Pölten
# Version: 2.1
# Last changed: 2021-07-05
#

import sys
import gzip
import pathlib
import argparse
import datetime

import numpy as np
import pandas as pd


def parse_arguments() -> argparse.Namespace:
    """Parses provided commandline arguments for LINK
    
    Returns:
        args (argparse.Namespace): object with paths to provided files
    """
    
    # create argument parser with description
    desc = '# Data Preparation for LINK\n'
    desc += '-'*64 + '\n'
    desc += 'Script outputs the same files with a "_clean" suffix.\n'
    desc += 'Existing clean versions are automatically overwritten!'
    
    parser = argparse.ArgumentParser(
        prog = 'prep.py',
        usage = 'python %(prog)s -c <config_file> -t <transmissions_file> -i <inca_dir>',
        description = desc,
        formatter_class = argparse.RawTextHelpFormatter
    )
    
    # add required argument group
    required_args = parser.add_argument_group('required arguments')
    
    # add config, transmissions and inca parameters to parser
    required_args.add_argument('-c', '--config', type = str, required = True, help = 'Path to Config-File')
    required_args.add_argument('-t', '--transmissions', type = str, required = True, help = 'Path to Transmissions-File')
    required_args.add_argument('-i', '--inca', type = str, required = True, help = 'Path to Inca-Dir')
    
    # parse arguments
    args = parser.parse_args()
    
    return args


def _log(msg: str) -> None:
    """Logs messages if verbose flag is set to True
    
    Parameters:
        msg (str): Message to log to console
        verbose (bool): Outputs message if set to True
	"""
    
    # add marker to log message
    marker = '%'
    if msg[:1] == '\n':
        msg = f'\n{marker} {msg[1:]}'
    else:
        msg = f'{marker} {msg}'
    
    # print message
    print(msg)


def load_inca_file(file_inca: pathlib.Path) -> np.array:
    """Loads loads gzipped INCA data to a given file
Fabian Kovac's avatar
Fabian Kovac committed
74
    
75
76
    Parameters:
       file_inca (pathlib.Path): Path to INCA file
Fabian Kovac's avatar
Fabian Kovac committed
77
    
78
79
80
81
82
83
84
    Returns:
        x (np.array): Matrix with shape (401, 701)
    """
    
    # open zipped file and bring data to right shape (resolution x,y: 701x401 km2)
    with gzip.open(file_inca, 'rb') as file:
        x = file.read()
        x = np.fromstring(x, dtype = np.float32, sep = ' ')
        x = np.reshape(x, (401, 701))

        return x


def load_inca_data(dir_inca: pathlib.Path) -> np.array:
    """Loads inca files to a given inca dir
Fabian Kovac's avatar
Fabian Kovac committed
93
    
94
95
    Parameters:
        dir_inca (pathlib.Path): Directory for INCA files
Fabian Kovac's avatar
Fabian Kovac committed
96
    
97
98
99
100
101
102
    Returns:
        inca_Data (np.array): Tensor with shape (96, 401, 701)
    """
    
    # initialize tensor with 96 15min intervals
    inca_data = np.zeros((96, 401, 701))
    
    # load inca files from inca dir
    for i, file_inca in enumerate(sorted([file for file in dir_inca.iterdir() if file.is_file()])):        
        # load zipped ascii data
        data = load_inca_file(file_inca)
        
        # update inca tensor
        inca_data[i] = data
    
    return inca_data
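
# Minimal usage sketch for the two loaders above (path and file layout are
# assumptions for illustration): dir_inca is expected to hold the 96 gzipped
# 15-minute ASCII grids of a single day, sorted chronologically by file name.
#
#     rr_day = load_inca_data(pathlib.Path('data/inca/20210705'))
#     rr_0800 = rr_day[32]    # 32 * 15 min = 08:00, one (401, 701) km grid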


def get_midpoint(lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array) -> np.array:
    """Calculcates the midpoint between two utm coordinate pairs
Fabian Kovac's avatar
Fabian Kovac committed
117
    
118
119
120
121
122
    Parameters:
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B
Fabian Kovac's avatar
Fabian Kovac committed
123
    
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
    Returns:
        lon, lat (np.array): Vectors with longitudes and latitudes of the midpoints
    """
    
    # convert degrees to radians
    lon_a, lat_a = np.radians(lon_a), np.radians(lat_a)
    lon_b, lat_b = np.radians(lon_b), np.radians(lat_b)
    
    # calculate midpoint
    Bx = np.cos(lat_b) * np.cos(lon_b - lon_a)
    By = np.cos(lat_b) * np.sin(lon_b - lon_a)
    
    lon_mid = lon_a + np.arctan2(By, np.cos(lat_a) + Bx)
    lat_mid = np.arctan2(
        np.sin(lat_a) + np.sin(lat_b),
        np.sqrt(np.square(np.cos(lat_a) + Bx) + np.square(By))
    )
    
    return np.degrees(lon_mid), np.degrees(lat_mid)
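
# Quick sanity check (illustrative values only): for two points on the same
# meridian the great-circle midpoint is simply the mean latitude, e.g.
#
#     lon, lat = get_midpoint(np.array([16.0]), np.array([48.0]),
#                             np.array([16.0]), np.array([49.0]))
#     # lon = 16.0, lat = 48.5 (approximately)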


def get_distance(lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array) -> np.array:
    """Calculcates distance between two coordinates in km using a WGS84 rotation-ellipsoid
    
148
    Parameters:
149
150
151
152
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B
Fabian Kovac's avatar
Fabian Kovac committed
153
    
154
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
155
        length (np.array): Vector with distances in km
156
    """
    
    # constants (equator radius and pole radius in km)
    # r_equator is the 'semi-major axis' and r_pole the 'semi-minor axis' on a WGS84 ellipsoid
    r_equator = 6378.137
    r_pole = 6356.7523142
    
    # calculate points on rotation-ellipsoid
    # point A
    za = np.sin(np.radians(lat_a)) * r_pole
    ra = np.cos(np.radians(lat_a)) * r_equator
    xa = np.sin(np.radians(lon_a)) * ra
    ya = np.cos(np.radians(lon_a)) * ra
    
    # point B
    zb = np.sin(np.radians(lat_b)) * r_pole
    rb = np.cos(np.radians(lat_b)) * r_equator
    xb = np.sin(np.radians(lon_b)) * rb
    yb = np.cos(np.radians(lon_b)) * rb
    
    # calculate differences between A and B
    dx = xa - xb
    dy = ya - yb
    dz = za - zb
    
    # calculate Euclidean (chord) distances between the points using the square root of the sum of squares
    # (for short link paths this closely approximates the distance along the surface)
    distance = np.sqrt(np.square(dx) + np.square(dy) + np.square(dz))
    
    return distance
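
# Rough plausibility check (illustrative values only): two sites near Vienna,
# 0.1 degrees of longitude and 0.05 degrees of latitude apart, should come out
# at roughly 9 km:
#
#     d = get_distance(np.array([16.30]), np.array([48.20]),
#                      np.array([16.40]), np.array([48.25]))
#     # d is roughly [9.3]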


def utm_to_lambert(lon: np.array, lat: np.array) -> tuple:
    """Convert UTM coordinates to an Austrian Lambert Conic Conformal Projection (lcc)
    EPSG code: 31287, see https://epsg.io/31287
190
191
    
    Parameters:
Fabian Kovac's avatar
Fabian Kovac committed
192
193
        lon (np.array): Vector containing utm longitudes
        lat (np.array): Vector containing utm latitudes
194
195
    
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
196
        x, y (np.array): Vectors containing lcc coordinates
197
198
	"""
    
    # convert utm coordinates to radians
    lon = np.radians(lon)
    lat = np.radians(lat)

    # define standard parallels according to EPSG:31287 - MGI/Austria Lambert
    lat1 = np.radians(49)
    lat2 = np.radians(46)
    
    # point of reference:
    #   lon: 13°20'E
    #   lat: 47°30'N
    lon0 = np.radians(13.33333333333333)
    lat0 = np.radians(47.5)
    
    # false easting/northing
    # INCA grid: 701x401 km
    # compensate for point of reference (middle of the grid)
    # move point of reference (0, 0) to the following x and y coordinates (x0, y0) to e.g. get strictly positive coordinates
    # false easting: 350500m
    # false northing: 200500m
    x0 = 350500
    y0 = 200500
    
    # volumetric mean radius of the earth in m
    R = 6371000

    # lambert conformal conic projection as in https://pubs.usgs.gov/pp/1395/report.pdf
    # reference:
    # >>    J. P. Snyder, Map projections--a working manual. Washington: U.S. G.P.O.,
    # >>    For sale by the Supt. of Docs., 1987., pp. 104-110
    n = np.log(np.cos(lat1) * (1 / np.cos(lat2))) / np.log(np.tan(np.pi/4 + lat2/2) * (1 / np.tan(np.pi/4 + lat1/2)))
    F = (np.cos(lat1) * np.tan(np.pi/4 + lat1/2)**n) / n
    p = R * F * (1 / np.tan(np.pi/4 + lat/2))**n
    p0 = R * F * (1 / np.tan(np.pi/4 + lat0/2))**n

    # calculate x and y in lcc
    x = p * np.sin(n * (lon - lon0)) + x0
    y = p0 - p * np.cos(n * (lon - lon0)) + y0

    return x, y
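
# Sanity check (illustrative only): the projection reference point 13°20'E /
# 47°30'N maps exactly onto the false easting/northing, i.e. the centre of the
# INCA grid:
#
#     x, y = utm_to_lambert(np.array([13.33333333333333]), np.array([47.5]))
#     # x = 350500.0, y = 200500.0 (meters)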


def lambert_to_inca_idx(x: np.array, y: np.array) -> tuple:
    """Convert x and y of Lambert Conic Conformal Projection to INCA index
243
244
    
    Parameters:
Fabian Kovac's avatar
Fabian Kovac committed
245
246
        x (np.array): Vector containing x values in meter of Lambert Conformal Conic Projection
        y (np.array): Vector containing y values in meter of in Lambert Conformal Conic Projection
247
248
    
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
249
        ix, iy (np.array): indices for INCA data
250
251
	"""
    
    # convert from m to km
    x /= 1000
    y /= 1000
    
    # convert coordinates to INCA index (nearest coordinates as int)
    ix = np.round(x, decimals = 0).astype(int)
    iy = np.round(y, decimals = 0).astype(int)
    
    return ix, iy
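
# Example (illustrative values only): a point at x = 351234 m, y = 200987 m
# falls into the 1 km INCA grid cell with indices ix = 351, iy = 201:
#
#     ix, iy = lambert_to_inca_idx(np.array([351234.0]), np.array([200987.0]))
#     # ix = [351], iy = [201]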


def get_inca_data(datetimes: np.array, lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array, lengths: np.array) -> np.array:
    """Get INCA RR data based for each km of link path
265
266
267
    
    Parameters:
        datetimes (np.array): Vector containing datetimes of transmissions
268
269
270
271
272
        lon_a (np.array): Vector containing longitude values of LINK site a
        lat_a (np.array): Vector containing latitude values of LINK site a
        lon_b (np.array): Vector containing longitude values of LINK site b
        lat_b (np.array): Vector containing latiotude values of LINK site b
        length (np.array): Vector containing distance between sites in km
273
274
    
    Returns:
275
        inca_RR (np.array): Vector containing INCA RR data for each transmission
276
277
	"""
    
    # load inca data (dir_inca is defined at module level in __main__)
    inca_data = load_inca_data(dir_inca)
    _log(f'Loaded INCA data from {str(dir_inca).split("/")[-1]} with shape {inca_data.shape}')
    
    # generate times of day in 15min (window) intervals
    # note: i * timedelta(seconds = 15) renders as '0:MM:SS'; stripping the leading '0:'
    #       yields exactly the strings '00:00' ... '23:45', i.e. the 96 interval start times
    window = 15
    inca_times = sorted([str(i * datetime.timedelta(seconds = window))[2:] for i in range(24*60//window)])
    
    # generate times of LINK data (last 4 chars of datetime vector)
    time_func = np.vectorize(lambda x: f'{x[-4:-2]}:{x[-2:]}')
    link_times = time_func(datetimes)
    
    # get INCA indices of LINK times
    idx_times = np.searchsorted(inca_times, link_times)
    _log(f'Created search indices vector for each transmission for INCA RR tensor')
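    # Example of the time lookup above (illustrative value only): a transmission
    # with DateTime '202107050815' yields the link time '08:15', which
    # np.searchsorted() maps to index 33 (= 8*4 + 1), i.e. the 34th 15-minute
    # INCA slice of the day.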
    
    
    # convert utm coordinates to lambert conic conformal projection
    xa, ya = utm_to_lambert(lon_a, lat_a)
    xb, yb = utm_to_lambert(lon_b, lat_b)
    
    # create lambert coordinate matrices for start and end points    
    start = np.array([xa, ya]).T
    end = np.array([xb, yb]).T
    
    # define n INCA RR points for each km of link length
    # --> min. points: 3 (start, mid and end) if length < 3km
    n = np.round(lengths, decimals = 0).astype(int)
    n = np.where(n < 3, 3, n)
    
    # set list for INCA RR data
    inca_RR = []
    
    # get n INCA RR data points for each transmission
    # TODO: find vectorized solution without loop
    # --> n is dynamic (different lengths of links) and np.linspace() only supports n as a scalar
    for i in range(len(link_times)):        
        # create n points for each km of link
        # --> lambert projection is in meters, therefore linspace equally divides the link into n points (including start, end and mid)
        x, y = np.linspace(start[i], end[i], n[i], axis = 1)
        
        # get nearest inca coordinates for each point on link
        idx, idy = lambert_to_inca_idx(x, y)
        
        # get INCA RR data for each point on link
        inca_RR.append(inca_data[idx_times[i], idy, idx])
    
    return inca_RR
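
# Worked example of the per-link sampling in get_inca_data() (values made up):
#
#     lengths = np.array([5.3, 1.2])
#     n = np.round(lengths, decimals = 0).astype(int)    # -> [5, 1]
#     n = np.where(n < 3, 3, n)                          # -> [5, 3]
#
# i.e. a 5.3 km link is sampled at 5 points along its path, while a 1.2 km link
# is padded to the minimum of 3 points (start, mid and end), evenly spaced via
# np.linspace() between the Lambert coordinates of site A and site B.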


def prep() -> None:
    """Data preparation for LINK config and transmissions"""
    
    _log('\n******************************** READ FILES ********************************')
    
    # read config and transmissions files (file_config and file_trans are set at module level in __main__)
    df_config = pd.read_csv(file_config, sep = ';')
    _log(f'Read config file with shape {df_config.shape}')
    df_trans = pd.read_csv(file_trans, sep = ';')
    _log(f'Read transmissions file with shape {df_trans.shape}')
    
    
    _log('\n******************************** BASIC PREP ********************************')
    
    # remove test-link with link id 1
    df_config = df_config[df_config['LINKID'] != 1]
    df_trans = df_trans[df_trans['RADIOLINKID'] != 1]
    _log('Removed all entries of test-link with linkid 1')
    
    
    # drop links that are officially not in use ('na' in CAPACITYINTERFACE and/or FREQUENCY)
    # --> see Q&A Phillip Scheffknecht (05 Feb 2021)
    df_config = df_config.dropna(axis = 0, subset = ['CAPACITYINTERFACE', 'FREQUENCY'])
    _log('Dropped configs with NA in CAPACITYINTERFACE and/or FREQUENCY (links officially not in use)')
    
    
    # delete rows with unused link ids
    # get link ids of config and transmissions
    config_ids = set(df_config['LINKID'].unique().tolist())
    trans_ids = set(df_trans['RADIOLINKID'].unique().tolist())
    
    # delete link ids in transmissions without config
    unused_trans_ids = trans_ids - config_ids
    df_trans = df_trans[~df_trans['RADIOLINKID'].isin(list(unused_trans_ids))]
    _log('Removed all links in transmissions where no config is present')
    
    # delete link ids in config without transmissions
    unused_config_ids = config_ids - trans_ids
    df_config = df_config[~df_config['LINKID'].isin(list(unused_config_ids))]
    _log('Removed all links in config where no transmission is present')
    
    
    # delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
    # gather duplicated rows in config file
    col_subset = ['LINKTYPE', 'SITEID_A', 'LATITUDE_A', 'LONGITUDE_A', 'SITEID_B', 'LATITUDE_B', 'LONGITUDE_B', 'CAPACITYINTERFACE', 'FREQUENCY']
    duplicated_config_ids = set(df_config[df_config.duplicated(subset = col_subset)]['LINKID'].unique().tolist())
    
    # gather duplicated link ids of config file in transmissions file
    found_trans_ids = set(df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist())
    
    # calculate unused duplicated ids in config file
    duplicated_unused_ids = duplicated_config_ids - found_trans_ids
    
    # delete rows with unused duplicated link ids in config file
    df_config = df_config[~df_config['LINKID'].isin(list(duplicated_unused_ids))]
    _log('Removed duplicated links which are not in use')
    
    
    # check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B) pairs
    # gather unique combinations of link ids and site ids of config
    # (new temporary column is much faster than pandas agg function)
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_A']
    config_loc_tuples = set(df_config['TEMP_LOC_TUPLE'].unique().tolist())
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_B']
    config_loc_tuples.update(set(df_config['TEMP_LOC_TUPLE'].unique().tolist()))
    
    # gather unique combinations of link ids and locations in transmissions
    # (new temporary column is much faster than pandas agg function)
    df_trans['TEMP_LOC_TUPLE'] = df_trans['RADIOLINKID'].astype(str) + '--' + df_trans['LOCATION']
    trans_loc_tuples = set(df_trans['TEMP_LOC_TUPLE'].unique().tolist())
    
    # calculate link id - location tuples which are not in config
    invalid_loc_tuples = trans_loc_tuples - config_loc_tuples
    
    # remove invalid tuples from transmissions
    df_trans = df_trans[~df_trans['TEMP_LOC_TUPLE'].isin(list(invalid_loc_tuples))]
    _log('Removed all transmissions with invalid RADIOLINKID - LOCATION tuples (not present in config)')
    
    # remove temp columns
    df_config = df_config.drop(['TEMP_LOC_TUPLE'], axis = 1)
    df_trans = df_trans.drop(['TEMP_LOC_TUPLE'], axis = 1)
    
    
    # calculate midpoint of links
    df_config['LONGITUDE_MID'], df_config['LATITUDE_MID'] = get_midpoint(
        df_config['LONGITUDE_A'], df_config['LATITUDE_A'],
        df_config['LONGITUDE_B'], df_config['LATITUDE_B']
    )
    _log('Calculated midpoint of links')
    
    # calculate LENGTH in km between link sites
    df_config['LENGTH'] = get_distance(df_config['LONGITUDE_A'], df_config['LATITUDE_A'], df_config['LONGITUDE_B'], df_config['LATITUDE_B'])
    _log('Calculated distances between sites using a WGS84 ellipsoid')
    
    
    # convert CAPACITYINTERFACE to int
    df_config['CAPACITYINTERFACE'] = df_config['CAPACITYINTERFACE'].map(lambda x: str(x)[:-5]).astype('int')
    _log('Converted CAPACITYINTERFACE to int')
    
    # convert FREQUENCY to float
    df_config['FREQUENCY'] = df_config['FREQUENCY'].map(lambda x: str(x)[:-3]).astype('float')
    _log('Converted FREQUENCY to float')
    
    # convert RXFREQUENCY and TXFREQUENCY to float and from MHz to GHz
    # check if columns exist (only present with 2021-05)
    if 'TXFREQUENCY' in df_config.columns and 'RXFREQUENCY' in df_config.columns:
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY'].astype('float')
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY']/1000
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY'].astype('float')
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY']/1000
        _log('Converted RXFREQUENCY and TXFREQUENCY to float and GHz')
    
    
    # drop transmissions with STATUS and/or OPERATIONALSTATUS not equal to 1
    df_trans = df_trans[df_trans['STATUS'] == 1]
    df_trans = df_trans[df_trans['OPERATIONALSTATUS'] == 1]
    _log('Removed transmissions with STATUS and/or OPERATIONALSTATUS unequal 1')
    
    
    _log('\n******************************** BUILD LINK DF *****************************')
    
    # copy transmissions dataframe to link dataframe
    df_link = df_trans.copy()
    _log('Copied transmissions dataframe to link dataframe')
    
    
    # convert begintime to utc
    df_link['BEGINTIME'] = pd.to_datetime(df_link['BEGINTIME'], format = '%Y-%m-%d %H:%M:%S')
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.tz_localize('Europe/Vienna').dt.tz_convert('UTC').dt.tz_localize(None)
    _log('Converted BEGINTIME to UTC')
    
    # only use transmissions with begintime matching the INCA date (INCA dir name is expected to be YYYYMMDD)
    date_inca = np.datetime64(f'{str(dir_inca.stem)[0:4]}-{str(dir_inca.stem)[-4:-2]}-{str(dir_inca.stem)[-2:]}')
    df_link = df_link[df_link['BEGINTIME'].dt.date == date_inca]
    _log('Filtered transmissions to match BEGINTIME with INCA date')
    
    
    # copy REMOTERXLEVEL to PMIN and PMAX (for aggregation in 15min window conversion)
    df_link['PMIN'] = df_link['REMOTERXLEVEL']
    df_link['PMAX'] = df_link['REMOTERXLEVEL']
    _log('Created PMIN and PMAX of REMOTERXLEVEL')
    
    # convert 3min windows to 15min windows
    group_cols = [df_link['BEGINTIME'].dt.floor('15Min'), 'RADIOLINKID']
    agg_cols = {
        'PMIN' : 'min',
        'PMAX' : 'max',
        'REMOTERXLEVEL' : 'mean',
        'TXLEVEL' : 'mean',
        'SPEED' : 'mean',
        'CURRRXBITRATE' : 'mean',
        'CURRTXBITRATE' : 'mean'
    }
    df_link = df_link.groupby(group_cols).agg(agg_cols).reset_index()
    _log('Converted 3min windows to 15min windows')
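    # Illustration of the window conversion above (values are made up): five
    # 3-minute samples of one link within 08:00-08:15 with REMOTERXLEVEL
    # -46, -44, -47, -45, -44 collapse into a single 08:00 row with
    # PMIN = -47, PMAX = -44 and a mean REMOTERXLEVEL of -45.2.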
    
    # convert BEGINTIME to RAINLINK format
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.strftime('%Y%m%d%H%M')
    _log('Converted BEGINTIME to RAINLINK format "%Y%m%d%H%M"')

    
    # build df with differences of sending and receiving levels
    df_diff = df_link[['RADIOLINKID', 'TXLEVEL', 'REMOTERXLEVEL']].copy()
    df_diff['PINSTMEAN'] = df_diff['TXLEVEL'] - df_diff['REMOTERXLEVEL']
    _log('Built dataframe with mean link difference levels of TXLEVEL and REMOTERXLEVEL')
    
    # get mean of differences per link
    df_diff = df_diff.groupby(['RADIOLINKID']).agg({'PINSTMEAN' : 'mean'}).reset_index()
    
    # merge differences to transmission dataframe
    df_link = pd.merge(df_link, df_diff, how = 'inner', left_on = 'RADIOLINKID', right_on = 'RADIOLINKID')
    _log('Merged mean link difference levels back to link dataframe')

    
    # calculate instantaneous values from transmitted and received signal strength
    df_link['PINST'] = df_link['TXLEVEL'] - df_link['REMOTERXLEVEL']
    _log('Calculated instantaneous values PINST from transmitted and received signal strength')
    
    # calculate differences from instantaneous values and mean link diff. levels
    df_link['PDIFF'] = df_link['PINST'] - df_link['PINSTMEAN']
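    # Worked example for PINST/PDIFF (values are made up): with TXLEVEL = 18 and
    # REMOTERXLEVEL = -46 the instantaneous loss is PINST = 18 - (-46) = 64;
    # with a mean link level PINSTMEAN of 62, the deviation from the link's
    # typical loss is PDIFF = 64 - 62 = 2.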

    
    
    # merge config and link dataframe
    drop_cols = ['RADIOLINKID', 'LINKTYPE', 'SITEID_A', 'SITEID_B', 'CAPACITYINTERFACE']
    df_link = pd.merge(df_link, df_config, how = 'inner', left_on = 'RADIOLINKID', right_on = 'LINKID').drop(drop_cols, axis = 1)
    _log('Merged config data to link dataframe')
    
    # rename and reorder columns to match the RAINLINK format
    name_cols = {
        'LINKID' : 'ID',
        'BEGINTIME' : 'DateTime',
        'PMIN' : 'Pmin',
        'PMAX' : 'Pmax',
        'REMOTERXLEVEL' : 'Pmean',
        'PINST' : 'Pinst',
        'PINSTMEAN' : 'PinstMean',
        'PDIFF' : 'Pdiff',
        'TXLEVEL' : 'TxLevel',
        'SPEED' : 'Speed',
        'CURRRXBITRATE' : 'RxBitrate',
        'CURRTXBITRATE' : 'TxBitrate',
        'LONGITUDE_A' : 'XStart',
        'LATITUDE_A' : 'YStart',
        'LONGITUDE_MID' : 'XMid',
        'LATITUDE_MID' : 'YMid',
        'LONGITUDE_B' : 'XEnd',
        'LATITUDE_B' : 'YEnd',
        'LENGTH' : 'PathLength',
        'FREQUENCY' : 'Frequency',
    }
    
    # check if RXFREQUENCY and TXFREQUENCY exist (only present with 2021-05)
    if 'TXFREQUENCY' in df_link.columns and 'RXFREQUENCY' in df_link.columns:
        name_cols.update({
            'RXFREQUENCY' : 'RxFrequency',
            'TXFREQUENCY' : 'TxFrequency'
        })
    
    df_link = df_link.rename(columns = name_cols).reindex(columns = list(name_cols.values()))
    _log('Converted link dataframe to RAINLINK format')
    
    
    _log('\n******************************** MERGE INCA ********************************')
    
    # set INCA RR data for each km of link path
    df_link['RRPath'] = get_inca_data(df_link['DateTime'], df_link['XStart'], df_link['YStart'], df_link['XEnd'], df_link['YEnd'], df_link['PathLength'])
    _log('Merged INCA RR data to transmissions')
    
    
    _log('\n******************************** SAVE FILES ********************************')
    
    # build path for clean config and transmissions destination files
    dest_config = file_config.with_name(f'{file_config.stem}_clean{file_config.suffix}')
    dest_trans = file_trans.with_name(f'{file_trans.stem}_clean{file_trans.suffix}')
    
    # build path for clean link destination file (same folder, date and extension as transmissions file)
    date = str(file_trans.stem).split('_')[-1]
    dest_link = pathlib.Path(dest_trans.parents[0], f'LINK_{date}_clean{file_trans.suffix}')
    
    
    # save clean files
    df_config.to_csv(dest_config, sep = ';', header = True, index = False)
    _log(f'Saved clean config file with shape {df_config.shape} to "{str(dest_config)}"')
    df_trans.to_csv(dest_trans, sep = ';', header = True, index = False)
    _log(f'Saved clean transmissions file with shape {df_trans.shape} to "{str(dest_trans)}"')
    df_link.to_csv(dest_link, sep = ';', header = True, index = False)
    _log(f'Saved clean link file with shape {df_link.shape} to "{str(dest_link)}"')


if __name__ == '__main__':
    # flag for data prep
    start_prep = True
    
    # get config and transmissions file from arguments
    args = parse_arguments()
    
    # convert config, transmissions and inca arguments to paths
    file_config = pathlib.Path(args.config)
    file_trans = pathlib.Path(args.transmissions)
    dir_inca = pathlib.Path(args.inca)
    
    # check if config file exists
    if not file_config.exists():
        _log('Invalid path for config file!')
        start_prep = False
        
    # check if transmissions file exists
    if not file_trans.exists():
        _log('Invalid path for transmissions file!')
        start_prep = False
    
    # check if inca dir exists
    if not dir_inca.exists():
        _log('Invalid path for inca directory!')
        start_prep = False
    
    # start prep if flag is True, otherwise exit with code 2
    if start_prep:
        prep()
    else:
        sys.exit(2)