prep.py 26.1 KB
Newer Older
1
2
3
4
#
# Title: Data Preparation for LINK Configs and Transmissions
# Author: Fabian Kovac <ds191008@fhstp.ac.at>
# Team: University of Applied Sciences St. Pölten
5
6
# Version: 2.9
# Last changed: 2021-10-06
7
8
9
#

import sys
10
import gzip
11
12
import pathlib
import argparse
13
import datetime
14
15
16
17
18
19

import numpy as np
import pandas as pd


def parse_arguments() -> argparse.Namespace:
20
    """Parses provided commandline arguments for LINK
21
22
	
	Returns:
23
		args (argparse.Namespace): object with paths to provided files
24
25
26
	"""
    
    # create argument parser with description
27
    desc = '# Data Preparation for LINK\n'
28
29
30
31
32
33
    desc += '-'*64 + '\n'
    desc += 'Script outputs the same files with a "_clean" suffix.\n'
    desc += 'Existing clean versions are automatically overwritten!'
    
    parser = argparse.ArgumentParser(
        prog = 'prep.py',
34
        usage = 'python %(prog)s -c <config_file> -t <transmissions_file> -i <inca_dir>',
35
36
37
38
39
40
41
42
43
        description = desc,
        formatter_class = argparse.RawTextHelpFormatter
    )
    
    # add required argument group
    # add config parameter to parser
    # add transmissions parameter to parser
    required_args = parser.add_argument_group('required arguments')
    required_args.add_argument('-c', '--config', type = str, required = True, help = 'Path to Config-File')
Fabian Kovac's avatar
Fabian Kovac committed
44
    required_args.add_argument('-t', '--transmissions', type = str, required = True, help = 'Path to Transmissions-File')
45
    required_args.add_argument('-i', '--inca', type = str, required = True, help = 'Path to Inca-Dir')
46
47
48
49
50
51
52
53
    
    # parse arguments
    args = parser.parse_args()
    
    return args


def _log(msg: str) -> None:
54
    """Logs message
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
    
    Parameters:
        msg (str): Message to log to console
	"""
    
    # add marker to log message
    marker = '%'
    if msg[:1] == '\n':
        msg = f'\n{marker} {msg[1:]}'
    else:
        msg = f'{marker} {msg}'
    
    # print message
    print(msg)


71
def load_inca_file(file_inca: pathlib.Path) -> np.array:
72
    """Loads gzipped INCA data of a given file
Fabian Kovac's avatar
Fabian Kovac committed
73
    
74
75
    Parameters:
       file_inca (pathlib.Path): Path to INCA file
Fabian Kovac's avatar
Fabian Kovac committed
76
    
77
78
79
80
81
82
83
    Returns:
        x (np.array): Matrix with shape (401, 701)
    """
    
    # open zipped file and bring data to right shape (resolution x,y: 701x401 km2)
    with gzip.open(file_inca, 'rb') as file:
        x = file.read()
84
        x = np.fromstring(x, dtype = np.float32, sep = ' ')
85
86
87
88
89
90
        x = np.reshape(x, (401, 701))

        return x


def load_inca_data(dir_inca: pathlib.Path) -> np.array:
91
    """Loads inca files of a given inca dir
Fabian Kovac's avatar
Fabian Kovac committed
92
    
93
94
    Parameters:
        dir_inca (pathlib.Path): Directory for INCA files
Fabian Kovac's avatar
Fabian Kovac committed
95
    
96
97
98
99
100
101
    Returns:
        inca_Data (np.array): Tensor with shape (96, 401, 701)
    """
    
    # initialize tensor with 96 15min intervals
    inca_data = np.zeros((96, 401, 701))
Fabian Kovac's avatar
Fabian Kovac committed
102
    
103
104
105
106
107
108
109
110
111
112
113
    # load inca dates from inca dir
    for i, file_inca in enumerate(sorted([file for file in dir_inca.iterdir() if file.is_file()])):        
        # load zipped ascii data
        data = load_inca_file(file_inca)
        
        # update inca tensor
        inca_data[i] = data
    
    return inca_data


114
def get_midpoint(lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array) -> np.array:
Fabian Kovac's avatar
Fabian Kovac committed
115
    """Calculcates the midpoint between two wgs 84 coordinate pairs
Fabian Kovac's avatar
Fabian Kovac committed
116
    
117
118
119
120
121
    Parameters:
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B
Fabian Kovac's avatar
Fabian Kovac committed
122
    
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
    Returns:
        lon, lat (np.array): Vectors with longitudes and latitudes of the midpoints
    """
    
    # convert degrees to radians
    lon_a, lat_a = np.radians(lon_a), np.radians(lat_a)
    lon_b, lat_b = np.radians(lon_b), np.radians(lat_b)
    
    # calculate midpoint
    Bx = np.cos(lat_b) * np.cos(lon_b - lon_a)
    By = np.cos(lat_b) * np.sin(lon_b - lon_a)
    
    lon_mid = lon_a + np.arctan2(By, np.cos(lat_a) + Bx)
    lat_mid = np.arctan2(
        np.sin(lat_a) + np.sin(lat_b),
        np.sqrt(np.square(np.cos(lat_a) + Bx) + np.square(By))
    )
    
    return np.degrees(lon_mid), np.degrees(lat_mid)


Fabian Kovac's avatar
Fabian Kovac committed
144
145
def get_distance(lon_a: np.array, lat_a: np.array, alt_a: np.array, lon_b: np.array, lat_b: np.array, alt_b: np.array) -> np.array:
    """Calculcates distance between two coordinates in m using a WGS84 rotation-ellipsoid
Fabian Kovac's avatar
Fabian Kovac committed
146
    
147
    Parameters:
148
149
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
Fabian Kovac's avatar
Fabian Kovac committed
150
        alt_a (np.array): Altitudes of point A
151
152
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B
Fabian Kovac's avatar
Fabian Kovac committed
153
        alt_b (np.array): Altitudes of point B
Fabian Kovac's avatar
Fabian Kovac committed
154
    
155
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
156
        length (np.array): Vector with distances in m
157
    """
Fabian Kovac's avatar
Fabian Kovac committed
158
    
Fabian Kovac's avatar
Fabian Kovac committed
159
    # constants (equator radius and pole radius in m)
160
    # r_equator is the 'semi-major axis' and r_pole the 'semi-minor axis' on a WGS84 ellipsoid
Fabian Kovac's avatar
Fabian Kovac committed
161
162
    r_equator = 6378137
    r_pole = 6356752.3142
Fabian Kovac's avatar
Fabian Kovac committed
163
164
165
    
    # calculate points on rotation-ellipsoid
    # point A
166
167
168
169
    za = np.sin(np.radians(lat_a)) * r_pole
    ra = np.cos(np.radians(lat_a)) * r_equator
    xa = np.sin(np.radians(lon_a)) * ra
    ya = np.cos(np.radians(lon_a)) * ra
Fabian Kovac's avatar
Fabian Kovac committed
170
171
    
    # point B
172
173
174
175
    zb = np.sin(np.radians(lat_b)) * r_pole
    rb = np.cos(np.radians(lat_b)) * r_equator
    xb = np.sin(np.radians(lon_b)) * rb
    yb = np.cos(np.radians(lon_b)) * rb
Fabian Kovac's avatar
Fabian Kovac committed
176
177
    
    # calculate differences between A and B
178
179
180
    dx = xa - xb
    dy = ya - yb
    dz = za - zb
Fabian Kovac's avatar
Fabian Kovac committed
181
182
183
    
    # calculate distances between points using the square root of the sum of squares
    distance = np.sqrt(np.square(dx) + np.square(dy) + np.square(dz))
Fabian Kovac's avatar
Fabian Kovac committed
184
185
186
    
    # incorporate altitudes into the distance
    distance = np.sqrt(np.square(distance) + np.square(alt_a - alt_b))
Fabian Kovac's avatar
Fabian Kovac committed
187
188
    	
    return distance
189
190


Fabian Kovac's avatar
Fabian Kovac committed
191
def wgs84_to_lambert(lon: np.array, lat: np.array) -> tuple:
Fabian Kovac's avatar
Fabian Kovac committed
192
193
    """Convert UTM coordinates to an Austrian Lambert Conic Conformal Projection (lcc)
    EPSG code: 31287, see https://epsg.io/31287
194
195
    
    Parameters:
Fabian Kovac's avatar
Fabian Kovac committed
196
197
        lon (np.array): Vector containing wgs 84 longitudes
        lat (np.array): Vector containing wgs 84 latitudes
198
199
    
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
200
        x, y (np.array): Vectors containing lcc coordinates
201
202
	"""
    
Fabian Kovac's avatar
Fabian Kovac committed
203
    # convert wgs 84 coordinates to radians
204
205
    lon = np.radians(lon)
    lat = np.radians(lat)
Fabian Kovac's avatar
Fabian Kovac committed
206

207
208
209
210
211
212
213
214
215
216
    # define standard parallels according to EPSG:31287 - MGI/Austria Lambert
    lat1 = np.radians(49)
    lat2 = np.radians(46)
    
    # point of reference:
    #   lon: 13°20'E
    #   lat: 47°30'N
    lon0 = np.radians(13.33333333333333)
    lat0 = np.radians(47.5)
    
Fabian Kovac's avatar
Fabian Kovac committed
217
    # false easting/northing
218
    # INCA grid: 701x401 km
Fabian Kovac's avatar
Fabian Kovac committed
219
    # compensate for point of reference (middle of the grid)
Fabian Kovac's avatar
Fabian Kovac committed
220
221
222
    # move point of reference (0, 0) to following x and y coordinates (x0, y0) to e.g. get strictly positive coordinates
    # false easting: 3505000m
    # false northing: 200500m
223
224
225
226
227
228
    x0 = 350500
    y0 = 200500
    
    # volumetric mean radius of the earth in m
    R = 6371000

Fabian Kovac's avatar
Fabian Kovac committed
229
230
231
232
    # lambert conformal conic projection as in https://pubs.usgs.gov/pp/1395/report.pdf
    # reference:
    # >>    J. P. Snyder, Map projections--a working manual. Washington: U.S. G.P.O.,
    # >>    For sale by the Supt. of Docs., 1987., pp. 104-110
Fabian Kovac's avatar
Fabian Kovac committed
233
    n = np.log(np.cos(lat1) * (1 / np.cos(lat2))) / np.log(np.tan(np.pi/4 + lat2/2) * (1 / np.tan(np.pi/4 + lat1/2)))
234
    F = (np.cos(lat1) * np.tan(np.pi/4 + lat1/2)**n) / n
Fabian Kovac's avatar
Fabian Kovac committed
235
236
    p = R * F * (1 / np.tan(np.pi/4 + lat/2))**n
    p0 = R * F * (1 / np.tan(np.pi/4 + lat0/2))**n
237

Fabian Kovac's avatar
Fabian Kovac committed
238
    # calculate x and y in lcc
239
240
241
242
243
244
    x = p * np.sin(n * (lon - lon0)) + x0
    y = p0 - p * np.cos(n * (lon - lon0)) + y0

    return x, y


245
def lambert_to_inca_idx(x: np.array, y: np.array) -> tuple:
246
    """Convert x and y of Lambert Conic Conformal Projection to an INCA index
247
248
    
    Parameters:
Fabian Kovac's avatar
Fabian Kovac committed
249
250
        x (np.array): Vector containing x values in meter of Lambert Conformal Conic Projection
        y (np.array): Vector containing y values in meter of in Lambert Conformal Conic Projection
251
252
    
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
253
        ix, iy (np.array): indices for INCA data
254
255
	"""
    
Fabian Kovac's avatar
Fabian Kovac committed
256
257
258
259
    # convert from m to km
    x /= 1000
    y /= 1000
    
260
261
262
    # convert coordinates to INCA index (nearest coordinates as int)
    ix = np.round(x, decimals = 0).astype(int)
    iy = np.round(y, decimals = 0).astype(int)
Fabian Kovac's avatar
Fabian Kovac committed
263
264
    
    return ix, iy
265
266


267
268
def get_inca_data(datetimes: np.array, lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array, lengths: np.array) -> np.array:
    """Get INCA RR data based for each km of link path
269
270
271
    
    Parameters:
        datetimes (np.array): Vector containing datetimes of transmissions
272
273
274
275
        lon_a (np.array): Vector containing longitude values of LINK site a
        lat_a (np.array): Vector containing latitude values of LINK site a
        lon_b (np.array): Vector containing longitude values of LINK site b
        lat_b (np.array): Vector containing latiotude values of LINK site b
Fabian Kovac's avatar
Fabian Kovac committed
276
        length (np.array): Vector containing distance between sites in m
277
278
    
    Returns:
279
        inca_RR (np.array): Vector containing INCA RR data for each transmission
280
281
	"""
    
282
283
284
    # load inca data
    inca_data = load_inca_data(dir_inca)
    _log(f'Loaded INCA data from {str(dir_inca).split("/")[-1]} with shape {inca_data.shape}')
285
286
287
288
289
    
    # generate times of day in 15min (window) intervals
    window = 15
    inca_times = sorted([str(i * datetime.timedelta(seconds = window))[2:] for i in range(24*60//window)])
    
Fabian Kovac's avatar
Fabian Kovac committed
290
291
292
    # generate times of LINK data (last 4 chars of datetime vector)
    time_func = np.vectorize(lambda x: f'{x[-4:-2]}:{x[-2:]}')
    link_times = time_func(datetimes)
293
    
294
    # get INCA indices of LINK times
295
    idx_times = np.searchsorted(inca_times, link_times)
296
    _log(f'Created search indices vector for each transmission for INCA RR tensor')
297
    
298
    
Fabian Kovac's avatar
Fabian Kovac committed
299
300
301
    # convert wgs 84 coordinates to lambert conic conformal projection
    xa, ya = wgs84_to_lambert(lon_a, lat_a)
    xb, yb = wgs84_to_lambert(lon_b, lat_b)
302
303
304
305
306
307
308
    
    # create lambert coordinate matrices for start and end points    
    start = np.array([xa, ya]).T
    end = np.array([xb, yb]).T
    
    # define n INCA RR points for each km of link length
    # --> min. points: 3 (start, mid and end) if length < 3km
309
    # --> get odd amount of RR points to always include the middle of the link
Fabian Kovac's avatar
Fabian Kovac committed
310
    n = np.round(lengths/1000, decimals = 0).astype(int)
311
    n = np.where(n < 3, 3, n)
312
    n = np.where(n%2 == 0, n+1, n)
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
    
    # set list for INCA RR data
    inca_RR = []
    
    # get n INCA RR data points for each transmission
    # TODO: find vectorized solution without loop
    # --> n is dynamic (diffent lengths of links) and np.linspace() only supports n as a scalar
    for i in range(len(link_times)):        
        # create n points for each km of link
        # --> lambert projection is in meters, therefore linspace equally divides the link into n points (including start, end and mid)
        x, y = np.linspace(start[i], end[i], n[i], axis = 1)
        
        # get nearest inca coordinates for each point on link
        idx, idy = lambert_to_inca_idx(x, y)
        
        # get INCA RR data for each point on link
        inca_RR.append(inca_data[idx_times[i], idy, idx])
330
331
    
    return inca_RR
332
333
334


def prep() -> None:
Fabian Kovac's avatar
Fabian Kovac committed
335
    """Data preparation for LINK config and transmissions"""
336
337
338
    
    _log('\n******************************** READ FILES ********************************')
    
339
    # read config and transmission files
340
341
342
343
    df_config = pd.read_csv(file_config, sep = ';')
    _log(f'Read config file with shape {df_config.shape}')
    df_trans = pd.read_csv(file_trans, sep = ';')
    _log(f'Read transmissions file with shape {df_trans.shape}')
344
345
    
    
346
347
348
349
350
351
    _log('\n******************************** BASIC PREP ********************************')
    
    # remove test-link with link id 1
    df_config = df_config[df_config['LINKID'] != 1]
    df_trans = df_trans[df_trans['RADIOLINKID'] != 1]
    _log('Removed all entries of test-link with linkid 1')
352
353
    
    
354
355
356
    # drop links that are officially not in use ('na' in CAPACITYINTERFACE and/or FREQUENCY)
    # --> see Q&A Phillip Scheffknecht (05 Feb 2021)
    df_config = df_config.dropna(axis = 0, subset = ['CAPACITYINTERFACE', 'FREQUENCY'])
Fabian Kovac's avatar
Fabian Kovac committed
357
358
359
360
361
    
    # check if RXFREQUENCY and TXFREQUENCY exists (only present after 2021-05)
    if 'TXFREQUENCY' in df_config.columns and 'RXFREQUENCY' in df_config.columns:
        df_config = df_config.dropna(axis = 0, subset = ['RXFREQUENCY', 'TXFREQUENCY'])
    
362
    _log('Dropped configs with NA in CAPACITYINTERFACE and/or FREQUENCY (links officially not in use)')
363
364
    
    
365
366
    # delete rows with unused link ids
    # get link ids of config and transmissions
367
368
    config_ids = set(df_config['LINKID'].unique().tolist())
    trans_ids = set(df_trans['RADIOLINKID'].unique().tolist())
369
    
370
    # delete link ids in transmissions without config
371
    unused_trans_ids = trans_ids - config_ids
372
373
    df_trans = df_trans[~df_trans['RADIOLINKID'].isin(list(unused_trans_ids))]
    _log('Removed all links in transmissions where no config is present')
374
    
375
    # delete link ids in config without transmissions
376
    unused_config_ids = config_ids - trans_ids
377
378
    df_config = df_config[~df_config['LINKID'].isin(list(unused_config_ids))]
    _log('Removed all links in config where no transmission is present')
379
380
    
    
381
    # delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
382
383
    # gather duplicated rows in config file
    col_subset = ['LINKTYPE', 'SITEID_A', 'LATITUDE_A', 'LONGITUDE_A', 'SITEID_B', 'LATITUDE_B', 'LONGITUDE_B', 'CAPACITYINTERFACE', 'FREQUENCY']
384
    duplicated_config_ids = set(df_config[df_config.duplicated(subset = col_subset)]['LINKID'].unique().tolist())
385
    
386
    # gather duplicated link ids of config file in transmissions file
387
    found_trans_ids = set(df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist())
388
    
389
    # calculate unused duplicated ids in config file
390
    duplicated_used_ids = duplicated_config_ids - found_trans_ids
391
    
392
    # delete rows with unused duplicated link ids in config file
393
    df_config = df_config[~df_config['LINKID'].isin(list(duplicated_used_ids))]
394
395
    _log('Removed duplicated links which are not in use')
    
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
    
    # check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B) pairs
    # gather unique combinations of link ids and site ids of config
    # (new temporary column is much faster then pandas agg function)
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_A']
    config_loc_tuples = set(df_config['TEMP_LOC_TUPLE'].unique().tolist())
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_B']
    config_loc_tuples.update(set(df_config['TEMP_LOC_TUPLE'].unique().tolist()))
    
    # gather unique combinations of link ids and locations in transmissions
    # (new temporary column is much faster then pandas agg function)
    df_trans['TEMP_LOC_TUPLE'] = df_trans['RADIOLINKID'].astype(str) + '--' + df_trans['LOCATION']
    trans_loc_tuples = set(df_trans['TEMP_LOC_TUPLE'].unique().tolist())
    
    # calculate link id - location tuples which are not in config
    invalid_loc_tuples = trans_loc_tuples - config_loc_tuples
    
    # remove invalid tuples from transmissions
    df_trans = df_trans[~df_trans['TEMP_LOC_TUPLE'].isin(list(invalid_loc_tuples))]
    _log('Removed all transmissions with invalid RADIOLINKID - LOCATION tuples (not present in config)')
    
    # remove temp columns
    df_config = df_config.drop(['TEMP_LOC_TUPLE'], axis = 1)
    df_trans = df_trans.drop(['TEMP_LOC_TUPLE'], axis = 1)
    
421
    
Fabian Kovac's avatar
Fabian Kovac committed
422
423
424
425
426
427
428
    # check if ANT_HEIGHT_ALTITUDE_A and ANT_HEIGHT_ALTITUDE_B exists (only present after 2021-09)
    # otherwise get altitudes of config file from "VIW_ZAMG_MW_CONFIG_DY_20210928_SH.csv"
    # --> see mail Jürgen Köhler (juergen.koehler@drei.com) at 2021-09-27 16:15
    if 'ANT_HEIGHT_ALTITUDE_A' not in df_config.columns and 'ANT_HEIGHT_ALTITUDE_B' not in df_config.columns:
        df_altitudes = pd.read_csv('files_from_H3A/CONFIG_ALTITUDES.csv', sep = ';')
        df_altitudes = df_altitudes[['LINKID', 'ANT_HEIGHT_ALTITUDE_A', 'ANT_HEIGHT_ALTITUDE_B']]
        df_config = pd.merge(df_config, df_altitudes, how = 'inner', left_on = 'LINKID', right_on = 'LINKID')
Fabian Kovac's avatar
Fabian Kovac committed
429
        _log('Added altitude of links')
Fabian Kovac's avatar
Fabian Kovac committed
430
431
    
    
432
    # calculate midpoint of links
Fabian Kovac's avatar
Fabian Kovac committed
433
434
435
436
    df_config['LONGITUDE_MID'], df_config['LATITUDE_MID'] = get_midpoint(
        df_config['LONGITUDE_A'], df_config['LATITUDE_A'],
        df_config['LONGITUDE_B'], df_config['LATITUDE_B']
    )
Fabian Kovac's avatar
Fabian Kovac committed
437
438
    df_config['ANT_HEIGHT_ALTITUDE_MID'] = (df_config['ANT_HEIGHT_ALTITUDE_A'] + df_config['ANT_HEIGHT_ALTITUDE_B']) // 2
    df_config['ANT_HEIGHT_ALTITUDE_MID'] = df_config['ANT_HEIGHT_ALTITUDE_MID'].astype('int')
439
440
    _log('Calculated midpoint of links')
    
Fabian Kovac's avatar
Fabian Kovac committed
441
    # calculate LENGTH in m between links
Fabian Kovac's avatar
Fabian Kovac committed
442
443
444
445
446
447
448
449
    df_config['LENGTH'] = get_distance(
        df_config['LONGITUDE_A'],
        df_config['LATITUDE_A'],
        df_config['ANT_HEIGHT_ALTITUDE_A'],
        df_config['LONGITUDE_B'],
        df_config['LATITUDE_B'],
        df_config['ANT_HEIGHT_ALTITUDE_B']
    )
450
    _log('Calculated distances between sites using a WGS84 ellipsoid')
451
452
    
    
Fabian Kovac's avatar
Fabian Kovac committed
453
454
455
456
    # convert CAPACITYINTERFACE to int
    df_config['CAPACITYINTERFACE'] = df_config['CAPACITYINTERFACE'].map(lambda x: str(x)[:-5]).astype('int')
    _log('Converted CAPACITYINTERFACE to int')
    
457
458
459
460
    # convert FREQUENCY to float
    df_config['FREQUENCY'] = df_config['FREQUENCY'].map(lambda x: str(x)[:-3]).astype('float')
    _log('Converted FREQUENCY to float')
    
461
462
    # convert RXFREQUENCY and TXFREQUENCY to float and from MHz to GHz
    # check if columns exists (only present with 2021-05)
463
464
465
    if 'TXFREQUENCY' in df_config.columns and 'RXFREQUENCY' in df_config.columns:
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY'].astype('float')
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY']/1000
466
467
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY'].astype('float')
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY']/1000
468
        _log('Converted RXFREQUENCY and TXFREQUENCY to float and GHz')
469
    
470
471
    
    # drop transmissions with (operational) status unequal 1
472
473
474
475
476
477
478
479
480
481
    df_trans = df_trans[df_trans['STATUS'] == 1]
    df_trans = df_trans[df_trans['OPERATIONALSTATUS'] == 1]
    _log('Removed transmissions with STATUS and/or OPERATIONALSTATUS unequal 1')
    
    
    _log('\n******************************** BUILD LINK DF *****************************')
    
    # copy transmissions dataframe to link dataframe
    df_link = df_trans.copy()
    _log('Copy transmissions dataframe to link dataframe')
482
483
    
    
484
    # convert begintime to utc
485
486
487
    df_link['BEGINTIME'] = pd.to_datetime(df_link['BEGINTIME'], format = '%Y-%m-%d %H:%M:%S')
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.tz_localize('Europe/Vienna').dt.tz_convert('UTC').dt.tz_localize(None)
    _log('Converted BEGINTIME to UTC')
488
    
Fabian Kovac's avatar
Fabian Kovac committed
489
    # fix 15min timelag in transmissions
490
491
492
    df_link['BEGINTIME'] = df_link['BEGINTIME'] - pd.Timedelta(15, unit = 'min')
    _log('Fixed 15min timelag in transmissions')
    
Fabian Kovac's avatar
Fabian Kovac committed
493
494
495
496
497
    # only use transmissions with begintime matching INCA date
    date_inca = np.datetime64(f'{str(dir_inca.stem)[0:4]}-{str(dir_inca.stem)[-4:-2]}-{str(dir_inca.stem)[-2:]}')
    df_link = df_link[df_link['BEGINTIME'].dt.date == date_inca]
    _log('Filtered transmissions to match BEGINTIME with INCA date')
    
498
    
499
500
501
502
    # copy REMOTERXLEVEL to PMIN and PMAX (for aggregation in 15min window conversion)
    df_link['PMIN'] = df_link['REMOTERXLEVEL']
    df_link['PMAX'] = df_link['REMOTERXLEVEL']
    _log('Created PMIN and PMAX of REMOTERXLEVEL')
503
    
504
    # convert 3min windows to 15min windows
Fabian Kovac's avatar
Fabian Kovac committed
505
    group_cols = [df_link['BEGINTIME'].dt.round('15Min'), 'RADIOLINKID']
506
507
508
    agg_cols = {
        'PMIN' : 'min',
        'PMAX' : 'max',
509
510
        'REMOTERXLEVEL' : 'mean',
        'TXLEVEL' : 'mean',
511
        'SPEED' : 'mean',
512
        'CURRRXBITRATE' : 'mean',
513
514
515
        'CURRTXBITRATE' : 'mean',
        'CURRRXPROFILE' : 'median',
        'CURRTXPROFILE' : 'median'
516
    }
517
518
519
520
521
522
    df_link = df_link.groupby(group_cols).agg(agg_cols).reset_index()
    _log('Converted 3min windows to 15min windows')
    
    # convert BEGINTIME to RAINLINK format
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.strftime('%Y%m%d%H%M')
    _log('Converted BEGINTIME to RAINLINK format "%Y%m%d%H%M"')
523
524
525
526
527
528
529
530
531
    
    
    # merge config and link dataframe
    drop_cols = ['RADIOLINKID', 'LINKTYPE', 'SITEID_A', 'SITEID_B', 'CAPACITYINTERFACE']
    df_link = pd.merge(df_link, df_config, how = 'inner', left_on = 'RADIOLINKID', right_on = 'LINKID').drop(drop_cols, axis = 1)
    _log('Merged config data to link dataframe')
    
    
    _log('\n******************************** CALC POWER LEVELS *************************')
532
    
533
    # build df with differences of sending and receiving levels
534
    df_diff = df_link[['LINKID', 'TXLEVEL', 'REMOTERXLEVEL']].copy()
535
    df_diff['PINSTMEAN'] = df_diff['TXLEVEL'] - df_diff['REMOTERXLEVEL']
536
537
    _log('Built dataframe with mean link difference levels of TXLEVEL and REMOTERXLEVEL')
    
538
539
    # get mean of differences per link
    df_diff = df_diff.groupby(['LINKID']).agg({'PINSTMEAN' : 'mean'}).reset_index()
540
    
541
    
542
    # merge differences to transmission dataframe
543
544
545
    df_link = pd.merge(df_link, df_diff, how = 'left', left_on = 'LINKID', right_on = 'LINKID')
    _log('Merged mean link difference levels back to link dataframe')
    
546
547
548
549
550
551
552
    
    # calculate instantaneous values from transmitted and received signal strength
    df_link['PINST'] = df_link['TXLEVEL'] - df_link['REMOTERXLEVEL']
    _log('Calculated instantaneous values PINST from transmitted and received signal strength')
    
    # calculate differences from instantaneous values and mean link diff. levels
    df_link['PDIFF'] = df_link['PINST'] - df_link['PINSTMEAN']
553
    _log('Calculated difference levels PDIFF instantaneous values derivating of daily means of dry periods')
554
555
    
    
556
557
558
559
560
    # calculate attenuation of transmitted and received signal strengths per transmission
    df_link['ATTENUATION'] = df_link['REMOTERXLEVEL'] - df_link['TXLEVEL']
    _log('Calculated attenuation per transmission')
    
    # calculate extinction coefficients for links based on attenuation and lengths of the links based on beer-lambert law
561
    # Coeff = -ln(10) * (Attenuation/PathLength)
562
    df_link['EXTINCTIONCOEFF'] = -np.log(10) * (df_link['ATTENUATION'] / (df_link['LENGTH']/1000))
563
564
565
    _log('Calculated extinction coefficients based on beer-lambert law')
    
    
566
567
    # rename and reorder columns to aid RAINLINK format
    name_cols = {
568
        'LINKID' : 'ID',
569
570
571
        'BEGINTIME' : 'DateTime',
        'PMIN' : 'Pmin',
        'PMAX' : 'Pmax',
572
        'REMOTERXLEVEL' : 'Pmean',
573
574
575
        'PINST' : 'Pinst',
        'PINSTMEAN' : 'PinstMean',
        'PDIFF' : 'Pdiff',
576
        'TXLEVEL' : 'TxLevel',
577
        'ATTENUATION' : 'Attenuation',
578
        'EXTINCTIONCOEFF' : 'ExtinctionCoeff',
579
580
        'SPEED' : 'Speed',
        'CURRRXBITRATE' : 'RxBitrate',
581
        'CURRTXBITRATE' : 'TxBitrate',
582
583
        'CURRRXPROFILE' : 'RxProfile',
        'CURRTXPROFILE' : 'TxProfile',
584
585
        'LONGITUDE_A' : 'XStart',
        'LATITUDE_A' : 'YStart',
Fabian Kovac's avatar
Fabian Kovac committed
586
        'ANT_HEIGHT_ALTITUDE_A' : 'AltStart',
587
588
        'LONGITUDE_MID' : 'XMid',
        'LATITUDE_MID' : 'YMid',
Fabian Kovac's avatar
Fabian Kovac committed
589
        'ANT_HEIGHT_ALTITUDE_MID' : 'AltMid',
590
591
        'LONGITUDE_B' : 'XEnd',
        'LATITUDE_B' : 'YEnd',
Fabian Kovac's avatar
Fabian Kovac committed
592
        'ANT_HEIGHT_ALTITUDE_B' : 'AltEnd',
593
594
        'LENGTH' : 'PathLength',
        'FREQUENCY' : 'Frequency',
595
    }
596
    
Fabian Kovac's avatar
Fabian Kovac committed
597
    # check if RXFREQUENCY and TXFREQUENCY exists (only present after 2021-05)
598
    if 'TXFREQUENCY' in df_link.columns and 'RXFREQUENCY' in df_link.columns:
599
        name_cols.update({
600
601
            'RXFREQUENCY' : 'RxFrequency',
            'TXFREQUENCY' : 'TxFrequency'
602
603
        })
    
604
605
606
607
    df_link = df_link.rename(columns = name_cols).reindex(columns = list(name_cols.values()))
    _log('Converted link dataframe to RAINLINK format')
    
    
608
609
610
611
612
613
614
615
616
617
618
619
620
621
    _log('\n******************************** MERGE INCA ********************************')
    
    # set INCA RR data for each km of link path
    df_link['RRpath'] = get_inca_data(
        df_link['DateTime'],
        df_link['XStart'],
        df_link['YStart'],
        df_link['XEnd'],
        df_link['YEnd'],
        df_link['PathLength']
     )
    _log('Merged INCA RR data to transmissions')
    
    
622
623
624
625
626
627
628
    _log('\n******************************** SAVE FILES ********************************')
    
    # build path for clean config and transmissions destination files
    dest_config = file_config.with_name(f'{file_config.stem}_clean{file_config.suffix}')
    dest_trans = file_trans.with_name(f'{file_trans.stem}_clean{file_trans.suffix}')
    
    # build path for clean link destination file (same folder, date and extension as transmissions file)
629
    date = str(file_trans.stem).split('_')[-1]
630
    dest_link = pathlib.Path(dest_trans.parents[0], f'LINK_{date}_clean{file_trans.suffix}')
631
632
633
    
    
    # save clean files
634
635
636
637
638
639
640
641
642
    df_config.to_csv(dest_config, sep = ';', header = True, index = False)
    _log(f'Saved clean config file with shape {df_config.shape} to "{str(dest_config)}"')
    df_trans.to_csv(dest_trans, sep = ';', header = True, index = False)
    _log(f'Saved clean transmissions file with shape {df_trans.shape} to "{str(dest_trans)}"')
    df_link.to_csv(dest_link, sep = ';', header = True, index = False)
    _log(f'Saved clean link file with shape {df_link.shape} to "{str(dest_link)}"')


if __name__ == '__main__':
643
644
645
    # flag for data prep
    start_prep = True
    
646
647
648
    # get config and transmissions file from arguments
    args = parse_arguments()
    
649
650
651
    # convert config and transmissions arguments to paths
    file_config = pathlib.Path(args.config)
    file_trans = pathlib.Path(args.transmissions)
652
    dir_inca = pathlib.Path(args.inca)
653
654
655
656
657
658
659
660
    
    # check if config files exists
    if not file_config.exists():
        _log('Invalid path for config file!')
        start_prep = False
        
    # check if transmissions file exists
    if not file_trans.exists():
661
662
663
664
665
666
        _log('Invalid path for transmissions file!')
        start_prep = False
    
    # chec if inca dir exists
    if not dir_inca.exists():
        _log('Invalid path for inca directory!')
667
668
669
670
        start_prep = False
    
    # start prep if flag is True, otherwise exit with code 2
    if start_prep:
671
        prep()
672
    else:
673
        sys.exit(2)