#
# Title: Data Preparation for LINK Configs and Transmissions
# Author: Fabian Kovac <fabian.kovac@fhstp.ac.at>
# Team: University of Applied Sciences St. Pölten
# Version: 4.1
# Last changed: 2022-01-21
#

import sys
import gzip
import json
import pathlib
import argparse
import datetime

import numpy as np
import pandas as pd

from shapely.geometry import Point, Polygon


def parse_arguments() -> argparse.Namespace:
    """Parses provided command line arguments for LINK
    
    Returns:
        args (argparse.Namespace): object with paths to provided files
    """
    
    # create argument parser with description
    desc = '# Data Preparation for LINK\n'
    desc += '-'*64 + '\n'
    desc += 'Script outputs cleaned versions of the files to a "clean" directory.\n'
    desc += 'Existing clean versions are automatically overwritten!'
    
    parser = argparse.ArgumentParser(
        prog = 'prep.py',
        usage = 'python %(prog)s -c <config_file> -t <transmissions_file> -i <inca_dir>',
        description = desc,
        formatter_class = argparse.RawTextHelpFormatter
    )
    
    # add required argument group
    required_args = parser.add_argument_group('required arguments')
    
    # add config parameter to parser
    required_args.add_argument('-c', '--config', type = str, required = True, help = 'Path to Config-File')
    
    # add transmissions parameter to parser
    required_args.add_argument('-t', '--transmissions', type = str, required = True, help = 'Path to Transmissions-File')
    
    # add inca parameter to parser
    required_args.add_argument('-i', '--inca', type = str, required = True, help = 'Path to Inca-Dir')
    
    # parse arguments
    args = parser.parse_args()
    
    return args
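
# Example invocation (illustrative sketch only; the file and directory names below
# are hypothetical placeholders, not paths that ship with this script):
#
#   python prep.py -c data/raw/CONFIG_20220121.csv -t data/raw/TRANS_20220121.csv -i data/inca/20220121
#
# prep() later derives the INCA date from the directory name (YYYYMMDD) and the name
# of the LINK output file from the part of the transmissions file name after the last '_'.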


def _log(msg: str) -> None:
    """Logs message
    
    Parameters:
        msg (str): Message to log to console
	"""
    
    # add marker to log message
    marker = '%'
    if msg[:1] == '\n':
        msg = f'\n{marker} {msg[1:]}'
    else:
        msg = f'{marker} {msg}'
    
    # print message
    print(msg)


def load_areas(file: pathlib.Path = 'areas.json') -> dict:
    """Load json file with areas and corresponding polygons
    
    Parameters:
       file (pathlib.Path): Path to areas json file
    
    Returns:
        areas_poly (dict): Dict containing areas and corresponding polygons
    """
    
    # area dictionary
    areas_poly = {}

    # load areas geojson file
    with open(file) as f:   
        for area, points in json.load(f).items():        
            transformed = Polygon([tuple(p) for p in points])
            areas_poly.update({area : transformed})
    
    return areas_poly
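
# Sketch of the structure load_areas() expects in areas.json (the area names and
# coordinates below are made-up placeholders; only the shape of the JSON -- a mapping
# from area name to a list of [lon, lat] vertices -- is assumed from the code above):
#
#   {
#       "east": [[14.0, 48.5], [17.2, 48.5], [17.2, 46.8], [14.0, 46.8]],
#       "west": [[ 9.5, 47.6], [12.0, 47.6], [12.0, 46.6], [ 9.5, 46.6]]
#   }
#
# get_link_area() below tests Point(lon, lat).within(polygon), so the vertex order
# in the JSON has to be (longitude, latitude) as well.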


def load_inca_file(file_inca: pathlib.Path) -> np.array:
    """Loads gzipped INCA data of a given file
    
    Parameters:
        file_inca (pathlib.Path): Path to INCA file
    
    Returns:
        x (np.array): Matrix with shape (401, 701)
    """
    
    # open zipped file and bring data to right shape (resolution x,y: 701x401 km2)
    with gzip.open(file_inca, 'rb') as file:
        x = file.read()
        x = np.fromstring(x, dtype = np.float32, sep = ' ')
        x = np.reshape(x, (401, 701))

        return x


def load_inca_data(dir_inca: pathlib.Path) -> np.array:
    """Loads INCA files of a given INCA dir
    
    Parameters:
        dir_inca (pathlib.Path): Directory for INCA files
    
    Returns:
        inca_data (np.array): Tensor with shape (96, 401, 701)
    """
    
    # initialize tensor with 96 15min intervals
    inca_data = np.zeros((96, 401, 701))
    
    # load INCA files from INCA dir (sorted by file name, one file per 15min interval)
    for i, file_inca in enumerate(sorted([file for file in dir_inca.iterdir() if file.is_file()])):        
        # load zipped ascii data
        data = load_inca_file(file_inca)
        
        # update inca tensor
        inca_data[i] = data
    
    return inca_data
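
# Assumption behind load_inca_data(): the INCA directory of one day holds the 96
# gzipped ASCII files of that day (one 15min RR field each), and their lexicographic
# file name order matches their chronological order -- slot i of the tensor is simply
# filled with the i-th file, so a complete, gap-free day is assumed. A minimal usage
# sketch with a hypothetical directory name:
#
#   inca_data = load_inca_data(pathlib.Path('data/inca/20220121'))
#   print(inca_data.shape)    # (96, 401, 701)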


def get_midpoint(lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array) -> np.array:
    """Calculates the midpoint between two WGS 84 coordinate pairs
    
    Parameters:
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B
    
    Returns:
        lon, lat (np.array): Vectors with longitudes and latitudes of the midpoints
    """
    
    # convert degrees to radians
    lon_a, lat_a = np.radians(lon_a), np.radians(lat_a)
    lon_b, lat_b = np.radians(lon_b), np.radians(lat_b)
    
    # calculate midpoint
    Bx = np.cos(lat_b) * np.cos(lon_b - lon_a)
    By = np.cos(lat_b) * np.sin(lon_b - lon_a)
    
    lon_mid = lon_a + np.arctan2(By, np.cos(lat_a) + Bx)
    lat_mid = np.arctan2(
        np.sin(lat_a) + np.sin(lat_b),
        np.sqrt(np.square(np.cos(lat_a) + Bx) + np.square(By))
    )
    
    return np.degrees(lon_mid), np.degrees(lat_mid)
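
# Worked plausibility check (coordinates are rough, illustrative values, not taken
# from the data): the midpoint of St. Pölten (15.62 E, 48.20 N) and Vienna
# (16.37 E, 48.21 N) has to come out close to the arithmetic mean of the coordinates,
# because the two points are close together and at almost the same latitude:
#
#   lon_mid, lat_mid = get_midpoint(np.array([15.62]), np.array([48.20]),
#                                   np.array([16.37]), np.array([48.21]))
#   # lon_mid ≈ 15.995, lat_mid ≈ 48.205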


def get_distance(lon_a: np.array, lat_a: np.array, alt_a: np.array, lon_b: np.array, lat_b: np.array, alt_b: np.array) -> np.array:
    """Calculates the distance between two coordinates in km using a WGS84 rotation-ellipsoid
    
    Parameters:
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
        alt_a (np.array): Altitudes of point A
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B
        alt_b (np.array): Altitudes of point B
    
    Returns:
        distance (np.array): Vector with distances in km
    """
    
    # constants (equator radius and pole radius in km)
    # r_equator is the 'semi-major axis' and r_pole the 'semi-minor axis' on a WGS84 ellipsoid
    r_equator = 6378.137
    r_pole = 6356.7523142
    
    # calculate points on rotation-ellipsoid
    # point A
    za = np.sin(np.radians(lat_a)) * r_pole
    ra = np.cos(np.radians(lat_a)) * r_equator
    xa = np.sin(np.radians(lon_a)) * ra
    ya = np.cos(np.radians(lon_a)) * ra
    
    # point B
    zb = np.sin(np.radians(lat_b)) * r_pole
    rb = np.cos(np.radians(lat_b)) * r_equator
    xb = np.sin(np.radians(lon_b)) * rb
    yb = np.cos(np.radians(lon_b)) * rb
    
    # calculate differences between A and B
    dx = xa - xb
    dy = ya - yb
    dz = za - zb
    
    # calculate distances between points using the square root of the sum of squares
    distance = np.sqrt(np.square(dx) + np.square(dy) + np.square(dz))
    
    # incorporate altitudes into the distance
    distance = np.sqrt(np.square(distance) + np.square((alt_a/1e3) - (alt_b/1e3)))
    
    return distance
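
# Worked plausibility check (illustrative values, not from the data): one degree of
# longitude at 48° N spans roughly 74.5 km, and the function returns the straight
# chord through the ellipsoid, which for link-scale distances is practically
# identical to the surface distance:
#
#   d = get_distance(np.array([15.0]), np.array([48.0]), np.array([0.0]),
#                    np.array([16.0]), np.array([48.0]), np.array([0.0]))
#   # d ≈ 74.5 (km)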


def get_link_area(lon: float, lat: float) -> str:
    """Get geographical area depending on the start of the link
    
    Parameters:
        lon (float): Float containing wgs 84 longitudes
        lat (float): Float containing wgs 84 latitudes
    
    Returns:
        area (str): Geographical area the link belongs to
	"""    
    
    # define point with start-coordinates of link
    point = Point(lon, lat)
    
    # get area depending on point
    for area, poly in areas_poly.items():
        if point.within(poly):
            return area
    
    return 'ukn'
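
# Note: get_link_area() reads the module-level areas_poly dict that is loaded from
# areas.json in the __main__ block at the bottom of this script, so it must only be
# called after that load. A small usage sketch (coordinates are made-up placeholders):
#
#   areas_poly = load_areas(pathlib.Path('areas.json'))
#   area = get_link_area(16.37, 48.21)    # returns an area name, or 'ukn' if the
#                                         # point lies outside every polygon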


def wgs84_to_lambert(lon: np.array, lat: np.array) -> tuple:
    """Convert UTM coordinates to an Austrian Lambert Conic Conformal Projection (lcc)
    EPSG code: 31287, see https://epsg.io/31287
241
242
    
    Parameters:
Fabian Kovac's avatar
Fabian Kovac committed
243
244
        lon (np.array): Vector containing wgs 84 longitudes
        lat (np.array): Vector containing wgs 84 latitudes
245
246
    
    Returns:
Fabian Kovac's avatar
Fabian Kovac committed
247
        x, y (np.array): Vectors containing lcc coordinates
248
249
	"""
    
    # convert wgs 84 coordinates to radians
    lon = np.radians(lon)
    lat = np.radians(lat)

    # define standard parallels according to EPSG:31287 - MGI/Austria Lambert
    lat1 = np.radians(49)
    lat2 = np.radians(46)
    
    # point of reference:
    #   lon: 13°20'E
    #   lat: 47°30'N
    lon0 = np.radians(13.33333333333333)
    lat0 = np.radians(47.5)
    
    # false easting/northing
    # INCA grid: 701x401 km
    # compensate for point of reference (middle of the grid)
    # move point of reference (0, 0) to the following x and y coordinates (x0, y0) to e.g. get strictly positive coordinates
    # false easting: 350500m
    # false northing: 200500m
    x0 = 350500
    y0 = 200500
    
    # volumetric mean radius of the earth in m
    R = 6371000

    # lambert conformal conic projection as in https://pubs.usgs.gov/pp/1395/report.pdf
    # reference:
    # >>    J. P. Snyder, Map projections--a working manual. Washington: U.S. G.P.O.,
    # >>    For sale by the Supt. of Docs., 1987., pp. 104-110
    n = np.log(np.cos(lat1) * (1 / np.cos(lat2))) / np.log(np.tan(np.pi/4 + lat2/2) * (1 / np.tan(np.pi/4 + lat1/2)))
    F = (np.cos(lat1) * np.tan(np.pi/4 + lat1/2)**n) / n
    p = R * F * (1 / np.tan(np.pi/4 + lat/2))**n
    p0 = R * F * (1 / np.tan(np.pi/4 + lat0/2))**n
    
    # calculate x and y in lcc
    x = p * np.sin(n * (lon - lon0)) + x0
    y = p0 - p * np.cos(n * (lon - lon0)) + y0

    return x, y
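
# Sanity check (follows directly from the formulas above): at the projection origin
# lon0 = 13°20'E, lat0 = 47°30'N the terms sin(n * (lon - lon0)) and p - p0 vanish,
# so the origin maps exactly onto the false easting/northing, i.e. the centre of the
# 701x401 km INCA grid:
#
#   x, y = wgs84_to_lambert(np.array([13.33333333333333]), np.array([47.5]))
#   # x ≈ 350500.0, y ≈ 200500.0 (metres)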


def lambert_to_inca_idx(x: np.array, y: np.array) -> tuple:
    """Convert x and y of Lambert Conformal Conic Projection to an INCA index
    
    Parameters:
        x (np.array): Vector containing x values in meter of Lambert Conformal Conic Projection
        y (np.array): Vector containing y values in meter of Lambert Conformal Conic Projection
    
    Returns:
        ix, iy (np.array): indices for INCA data
    """
    
    # convert from m to km
    x /= 1000
    y /= 1000
    
    # convert coordinates to INCA index (nearest coordinates as int)
    ix = np.round(x, decimals = 0).astype(int)
    iy = np.round(y, decimals = 0).astype(int)
    
    return ix, iy
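
# Usage sketch chaining the two helpers above (the coordinates are the projection
# origin again, so the indices land in the middle of the 701x401 INCA grid):
#
#   x, y = wgs84_to_lambert(np.array([13.33333333333333]), np.array([47.5]))
#   ix, iy = lambert_to_inca_idx(x, y)
#   # ix ≈ 350, iy ≈ 200, i.e. valid column/row indices into a (401, 701) INCA field
#
# Note that lambert_to_inca_idx() divides its inputs in place (x /= 1000); the caller
# in get_inca_data() always passes freshly created arrays, so nothing else is affected.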


def get_inca_data(datetimes: np.array, lon_a: np.array, lat_a: np.array, lon_b: np.array, lat_b: np.array, lengths: np.array) -> np.array:
    """Get INCA RR data based for each km of link path
316
317
318
    
    Parameters:
        datetimes (np.array): Vector containing datetimes of transmissions
319
320
321
322
        lon_a (np.array): Vector containing longitude values of LINK site a
        lat_a (np.array): Vector containing latitude values of LINK site a
        lon_b (np.array): Vector containing longitude values of LINK site b
        lat_b (np.array): Vector containing latiotude values of LINK site b
323
        length (np.array): Vector containing distance between sites in km
324
325
    
    Returns:
326
        inca_RR (np.array): Vector containing INCA RR data for each transmission
327
328
	"""
    
    # load inca data
    inca_data = load_inca_data(dir_inca)
    _log(f'Loaded INCA data from {str(dir_inca).split("/")[-1]} with shape {inca_data.shape}')
    
    # generate times of day in 15min (window) intervals
    # --> timedelta seconds are used as minutes here: str(timedelta)[2:] yields zero-padded
    #     'MM:SS' strings, which read as the 96 interval starts '00:00' ... '23:45' in 'HH:MM'
    window = 15
    inca_times = sorted([str(i * datetime.timedelta(seconds = window))[2:] for i in range(24*60//window)])
    
    # generate times of LINK data (last 4 chars of datetime vector)
    time_func = np.vectorize(lambda x: f'{x[-4:-2]}:{x[-2:]}')
    link_times = time_func(datetimes)
    
    # get INCA indices of LINK times
    idx_times = np.searchsorted(inca_times, link_times)
    _log(f'Created search indices vector for each transmission for INCA RR tensor')
    
    
    # convert wgs 84 coordinates to lambert conic conformal projection
    xa, ya = wgs84_to_lambert(lon_a, lat_a)
    xb, yb = wgs84_to_lambert(lon_b, lat_b)
    
    # create lambert coordinate matrices for start and end points    
    start = np.array([xa, ya]).T
    end = np.array([xb, yb]).T
    
    # define n INCA RR points for each km of link length
    # --> min. points: 3 (start, mid and end) if length < 3km
    # --> get odd amount of RR points to always include the middle of the link
    n = np.round(lengths, decimals = 0).astype(int)
    n = np.where(n < 3, 3, n)
    n = np.where(n%2 == 0, n+1, n)
    
    # set list for INCA RR data
    inca_RR = []
    
    # get n INCA RR data points for each transmission
    # TODO: find vectorized solution without loop
    # --> n is dynamic (different lengths of links) and np.linspace() only supports n as a scalar
    for i in range(len(link_times)):        
        # create n points for each km of link
        # --> lambert projection is in meters, therefore linspace equally divides the link into n points (including start, end and mid)
        x, y = np.linspace(start[i], end[i], n[i], axis = 1)
        
        # get nearest inca coordinates for each point on link
        idx, idy = lambert_to_inca_idx(x, y)
        
        # get INCA RR data for each point on link
        inca_RR.append(inca_data[idx_times[i], idy, idx])
    
    return inca_RR


def prep() -> None:
    """Data preparation for LINK config and transmissions"""
    
    _log('\n******************************** READ FILES ********************************')
    
    # read config and transmission files
    df_config = pd.read_csv(file_config, sep = ';')
    _log(f'Read config file with shape {df_config.shape}')
    df_trans = pd.read_csv(file_trans, sep = ';')
    _log(f'Read transmissions file with shape {df_trans.shape}')
    
    
    _log('\n******************************** BASIC PREP ********************************')
    
    # remove test-link with link id 1
    df_config = df_config[df_config['LINKID'] != 1]
    df_trans = df_trans[df_trans['RADIOLINKID'] != 1]
    _log('Removed all entries of test-link with linkid 1')
    
    
    # drop links that are officially not in use ('na' in CAPACITYINTERFACE and/or FREQUENCY)
    # --> see Q&A Phillip Scheffknecht (05 Feb 2021)
    df_config = df_config.dropna(axis = 0, subset = ['CAPACITYINTERFACE', 'FREQUENCY'])
    
    # check if RXFREQUENCY and TXFREQUENCY exist (only present after 2021-05)
    if 'TXFREQUENCY' in df_config.columns and 'RXFREQUENCY' in df_config.columns:
        df_config = df_config.dropna(axis = 0, subset = ['RXFREQUENCY', 'TXFREQUENCY'])
    
    _log('Dropped configs with NA in CAPACITYINTERFACE and/or FREQUENCY (links officially not in use)')
    
    
    # delete rows with unused link ids
    # get link ids of config and transmissions
    config_ids = set(df_config['LINKID'].unique().tolist())
    trans_ids = set(df_trans['RADIOLINKID'].unique().tolist())
    
    # delete link ids in transmissions without config
    unused_trans_ids = trans_ids - config_ids
    df_trans = df_trans[~df_trans['RADIOLINKID'].isin(list(unused_trans_ids))]
    _log('Removed all links in transmissions where no config is present')
    
    # delete link ids in config without transmissions
    unused_config_ids = config_ids - trans_ids
    df_config = df_config[~df_config['LINKID'].isin(list(unused_config_ids))]
    _log('Removed all links in config where no transmission is present')
    
    
    # delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
    # gather duplicated rows in config file
    col_subset = ['LINKTYPE', 'SITEID_A', 'LATITUDE_A', 'LONGITUDE_A', 'SITEID_B', 'LATITUDE_B', 'LONGITUDE_B', 'CAPACITYINTERFACE', 'FREQUENCY']
    duplicated_config_ids = set(df_config[df_config.duplicated(subset = col_subset)]['LINKID'].unique().tolist())
    
    # gather duplicated link ids of config file in transmissions file
    found_trans_ids = set(df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist())
    
    # calculate unused duplicated ids in config file
    duplicated_used_ids = duplicated_config_ids - found_trans_ids
    
    # delete rows with unused duplicated link ids in config file
    df_config = df_config[~df_config['LINKID'].isin(list(duplicated_used_ids))]
    _log('Removed duplicated links which are not in use')
    
    
    # check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B) pairs
    # gather unique combinations of link ids and site ids of config
    # (new temporary column is much faster than pandas agg function)
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_A']
    config_loc_tuples = set(df_config['TEMP_LOC_TUPLE'].unique().tolist())
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_B']
    config_loc_tuples.update(set(df_config['TEMP_LOC_TUPLE'].unique().tolist()))
    
    # gather unique combinations of link ids and locations in transmissions
    # (new temporary column is much faster than pandas agg function)
    df_trans['TEMP_LOC_TUPLE'] = df_trans['RADIOLINKID'].astype(str) + '--' + df_trans['LOCATION']
    trans_loc_tuples = set(df_trans['TEMP_LOC_TUPLE'].unique().tolist())
    
    # calculate link id - location tuples which are not in config
    invalid_loc_tuples = trans_loc_tuples - config_loc_tuples
    
    # remove invalid tuples from transmissions
    df_trans = df_trans[~df_trans['TEMP_LOC_TUPLE'].isin(list(invalid_loc_tuples))]
    _log('Removed all transmissions with invalid RADIOLINKID - LOCATION tuples (not present in config)')
    
    # remove temp columns
    df_config = df_config.drop(['TEMP_LOC_TUPLE'], axis = 1)
    df_trans = df_trans.drop(['TEMP_LOC_TUPLE'], axis = 1)
    
    
    # check if ANT_HEIGHT_ALTITUDE_A and ANT_HEIGHT_ALTITUDE_B exist (only present after 2021-09)
    # otherwise get altitudes of config file from "VIW_ZAMG_MW_CONFIG_DY_20210928_SH.csv"
    # --> see mail Jürgen Köhler (juergen.koehler@drei.com) at 2021-09-27 16:15
    if 'ANT_HEIGHT_ALTITUDE_A' not in df_config.columns and 'ANT_HEIGHT_ALTITUDE_B' not in df_config.columns:
        df_altitudes = pd.read_csv('files_from_H3A/CONFIG_ALTITUDES.csv', sep = ';')
        df_altitudes = df_altitudes[['LINKID', 'ANT_HEIGHT_ALTITUDE_A', 'ANT_HEIGHT_ALTITUDE_B']]
        df_config = pd.merge(df_config, df_altitudes, how = 'inner', left_on = 'LINKID', right_on = 'LINKID')
        del df_altitudes
        _log('Added altitude of links')
    
    
    # calculate midpoint of links
    df_config['LONGITUDE_MID'], df_config['LATITUDE_MID'] = get_midpoint(
        df_config['LONGITUDE_A'], df_config['LATITUDE_A'],
        df_config['LONGITUDE_B'], df_config['LATITUDE_B']
    )
    df_config['ANT_HEIGHT_ALTITUDE_MID'] = (df_config['ANT_HEIGHT_ALTITUDE_A'] + df_config['ANT_HEIGHT_ALTITUDE_B']) // 2
    df_config['ANT_HEIGHT_ALTITUDE_MID'] = df_config['ANT_HEIGHT_ALTITUDE_MID'].astype('int')
    _log('Calculated midpoint of links')
    
    # calculate LENGTH in km between link sites
    df_config['LENGTH'] = get_distance(
        df_config['LONGITUDE_A'],
        df_config['LATITUDE_A'],
        df_config['ANT_HEIGHT_ALTITUDE_A'],
        df_config['LONGITUDE_B'],
        df_config['LATITUDE_B'],
        df_config['ANT_HEIGHT_ALTITUDE_B']
    )
    _log('Calculated distances between sites using a WGS84 ellipsoid')
    
    
    # calculate geographical area of link
    df_config['AREA'] = df_config.apply(lambda row: get_link_area(row['LONGITUDE_A'], row['LATITUDE_A']), axis = 1)
    
    
    # convert CAPACITYINTERFACE to int
    df_config['CAPACITYINTERFACE'] = df_config['CAPACITYINTERFACE'].map(lambda x: str(x)[:-5]).astype('int')
    _log('Converted CAPACITYINTERFACE to int')
    
    # convert FREQUENCY to float
    df_config['FREQUENCY'] = df_config['FREQUENCY'].map(lambda x: str(x)[:-3]).astype('float')
    _log('Converted FREQUENCY to float')
    
    # convert RXFREQUENCY and TXFREQUENCY to float and from MHz to GHz
    # check if columns exist (only present after 2021-05)
    if 'TXFREQUENCY' in df_config.columns and 'RXFREQUENCY' in df_config.columns:
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY'].astype('float')
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY']/1e3
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY'].astype('float')
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY']/1e3
        _log('Converted RXFREQUENCY and TXFREQUENCY to float and GHz')
    
    
    # drop transmissions with (operational) status unequal 1
    df_trans = df_trans[df_trans['STATUS'] == 1]
    df_trans = df_trans[df_trans['OPERATIONALSTATUS'] == 1]
    _log('Removed transmissions with STATUS and/or OPERATIONALSTATUS unequal 1')
    
    
    _log('\n******************************** BUILD LINK DF *****************************')
    
    # copy transmissions dataframe to link dataframe
    df_link = df_trans.copy()
    _log('Copied transmissions dataframe to link dataframe')
    
    
    # convert BEGINTIME to UTC
    df_link['BEGINTIME'] = pd.to_datetime(df_link['BEGINTIME'], format = '%Y-%m-%d %H:%M:%S')
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.tz_localize('Europe/Vienna').dt.tz_convert('UTC').dt.tz_localize(None)
    _log('Converted BEGINTIME to UTC')
    
    # only use transmissions with begintime matching INCA date
    date_inca = np.datetime64(f'{str(dir_inca.stem)[0:4]}-{str(dir_inca.stem)[-4:-2]}-{str(dir_inca.stem)[-2:]}')
    df_link = df_link[df_link['BEGINTIME'].dt.date == date_inca]
    _log('Filtered transmissions to match BEGINTIME with INCA date')
    
    
    # copy REMOTERXLEVEL to PMIN and PMAX (for aggregation in 15min time window)
    df_link['PMIN'] = df_link['REMOTERXLEVEL']
    df_link['PMAX'] = df_link['REMOTERXLEVEL']
    _log('Created PMIN, PMAX and mean REMOTERXLEVEL')

    # convert 3min windows to 15min windows to get min, max and mean power levels
    group_cols = [df_link['BEGINTIME'].dt.round('15Min'), 'RADIOLINKID']
    agg_cols = {
        'PMIN' : 'min',
        'PMAX' : 'max',
        'REMOTERXLEVEL' : 'mean',
        'TXLEVEL' : 'mean',
        'SPEED' : 'median',
        'CURRRXBITRATE' : 'median',
        'CURRTXBITRATE' : 'median',
        'CURRRXPROFILE' : 'median',
        'CURRTXPROFILE' : 'median'
    }
    df_link = df_link.groupby(group_cols).agg(agg_cols).reset_index()
    _log('Converted 3min windows to 15min windows to get min, max and mean power levels')
    
    
    # convert BEGINTIME to format "%Y%m%d%H%M"
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.strftime('%Y%m%d%H%M')
    _log('Converted BEGINTIME to format "%Y%m%d%H%M"')
    
    
    # merge config and link dataframe
    drop_cols = ['RADIOLINKID', 'LINKTYPE', 'SITEID_A', 'SITEID_B', 'CAPACITYINTERFACE']
    df_link = pd.merge(df_link, df_config, how = 'inner', left_on = 'RADIOLINKID', right_on = 'LINKID').drop(drop_cols, axis = 1)
    _log('Merged config data to link dataframe')
    
    
    _log('\n******************************** CALC POWER LEVELS *************************')
    
    # load dry LINK attenuation statistics
    df_stats = pd.read_csv('files_from_H3A/00_Link_dry_attnstats.csv', sep = ';')
    
    # merge mean and std attenuation to transmission dataframe
    df_link = pd.merge(df_link, df_stats, how = 'left', left_on = 'LINKID', right_on = 'LINKID')
    df_link = df_link.dropna(axis = 0, subset = ['ATTENUATIONMEANDRY', 'ATTENUATIONSTDDRY'])
    del df_stats
    _log('Merged mean and std of link attenuations to link dataframe')
    
    
    
    # calculate attenuation of transmitted and received signal strengths per transmission
    df_link['ATTENUATION'] = df_link['REMOTERXLEVEL'] - df_link['TXLEVEL']
    _log('Calculated attenuation per transmission')

    # calculate differences between attenuations and dry mean attenuations
    df_link['ATTENUATIONDIFF'] = df_link['ATTENUATION'] - df_link['ATTENUATIONMEANDRY']
    
    df_std = df_link.groupby(['LINKID']).agg({'ATTENUATION' : 'std'}).reset_index()
    df_std = df_std.rename(columns = {'LINKID' : 'LINKID', 'ATTENUATION' : 'ATTENUATIONSTD'})
    
    df_link = pd.merge(df_link, df_std, how = 'left', left_on = 'LINKID', right_on = 'LINKID')
    df_link['ATTENUATIONSTDDIFF'] = np.sqrt(df_link['ATTENUATIONSTD']**2 + df_link['ATTENUATIONSTDDRY']**2)
    del(df_std)
    _log('Calculated attenuation differences of dry means')
    
    # calculate extinction coefficients from attenuation and link length based on the beer-lambert law
    # Coeff = -ln(10) * (Attenuation/PathLength)
    df_link['EXTINCTIONCOEFF'] = -np.log(10) * (df_link['ATTENUATION'] / (df_link['LENGTH']))
    _log('Calculated extinction coefficients based on beer-lambert law')
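    # Worked example for the extinction coefficients above (illustrative numbers, not
    # from the data): an attenuation of -3 dB over a 10 km link gives
    #   EXTINCTIONCOEFF = -ln(10) * (-3 / 10) ≈ 0.69
    # i.e. stronger attenuation (more negative ATTENUATION) yields a larger positive coefficient.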
    
    
    # rename and reorder columns
    name_cols = {
        'LINKID' : 'ID',
        'BEGINTIME' : 'DateTime',
        'REMOTERXLEVEL' : 'RxLevel',
        'PMIN' : 'RxLevelMin',
        'PMAX' : 'RxLevelMax',
        'TXLEVEL' : 'TxLevel',
        'ATTENUATION' : 'Attn',
        'ATTENUATIONSTD' : 'AttnStd',
        'ATTENUATIONMEANDRY' : 'AttnMeanDry',
        'ATTENUATIONSTDDRY' : 'AttnStdDry',
        'ATTENUATIONDIFF' : 'AttnDiff',
        'ATTENUATIONSTDDIFF' : 'AttnStdDiff',
        'EXTINCTIONCOEFF' : 'ExtinctionCoeff',
        'SPEED' : 'Speed',
        'CURRRXBITRATE' : 'RxBitrate',
        'CURRTXBITRATE' : 'TxBitrate',
        'CURRRXPROFILE' : 'RxProfile',
        'CURRTXPROFILE' : 'TxProfile',
        'LONGITUDE_A' : 'XStart',
        'LATITUDE_A' : 'YStart',
        'ANT_HEIGHT_ALTITUDE_A' : 'AltStart',
        'LONGITUDE_MID' : 'XMid',
        'LATITUDE_MID' : 'YMid',
        'ANT_HEIGHT_ALTITUDE_MID' : 'AltMid',
        'LONGITUDE_B' : 'XEnd',
        'LATITUDE_B' : 'YEnd',
        'ANT_HEIGHT_ALTITUDE_B' : 'AltEnd',
        'LENGTH' : 'PathLength',
        'FREQUENCY' : 'Frequency',
        'AREA' : 'Area'
    }
    
    # check if RXFREQUENCY and TXFREQUENCY exist (only present after 2021-05)
    if 'TXFREQUENCY' in df_link.columns and 'RXFREQUENCY' in df_link.columns:
        name_cols.update({
            'RXFREQUENCY' : 'RxFrequency',
            'TXFREQUENCY' : 'TxFrequency'
        })
    
    df_link = df_link.rename(columns = name_cols).reindex(columns = list(name_cols.values()))
    _log('Renamed and reordered link dataframe columns')
    
    
    _log('\n******************************** MERGE INCA ********************************')
    
    # set INCA RR data for each km of link path
    df_link['RRpath'] = get_inca_data(
        df_link['DateTime'],
        df_link['XStart'],
        df_link['YStart'],
        df_link['XEnd'],
        df_link['YEnd'],
        df_link['PathLength']
     )
    _log('Merged INCA RR data to transmissions')
    
    
    _log('\n******************************** SAVE FILES ********************************')
    
    # build path for clean config and transmissions destination files
    dest_config = file_config.parent.parent / 'clean' / f'{file_config.stem}{file_config.suffix}'
    dest_trans = file_trans.parent.parent / 'clean' / f'{file_trans.stem}{file_trans.suffix}'
    
    # build path for clean link destination file (same folder, date and extension as transmissions file)
    date = str(file_trans.stem).split('_')[-1]
    dest_link = pathlib.Path(dest_trans.parents[0], f'LINK_{date}{file_trans.suffix}')
    
    
    # save clean files
    df_config.to_csv(dest_config, sep = ';', header = True, index = False)
    _log(f'Saved clean config file with shape {df_config.shape} to "{str(dest_config)}"')
    df_trans.to_csv(dest_trans, sep = ';', header = True, index = False)
    _log(f'Saved clean transmissions file with shape {df_trans.shape} to "{str(dest_trans)}"')
    df_link.to_csv(dest_link, sep = ';', header = True, index = False)
    _log(f'Saved clean link file with shape {df_link.shape} to "{str(dest_link)}"')


if __name__ == '__main__':
    # flag for data prep
    start_prep = True
    
    # get config and transmissions file from arguments
    args = parse_arguments()
    
    # convert config and transmissions arguments to paths
    file_config = pathlib.Path(args.config)
    file_trans = pathlib.Path(args.transmissions)
    dir_inca = pathlib.Path(args.inca)
    
    # check if config files exists
    if not file_config.exists():
        _log('Invalid path for config file!')
        start_prep = False
        
    # check if transmissions file exists
    if not file_trans.exists():
        _log('Invalid path for transmissions file!')
        start_prep = False
    
    # check if inca dir exists
    if not dir_inca.exists():
        _log('Invalid path for inca directory!')
        start_prep = False
    
    # load areas json file
    areas_poly = load_areas(pathlib.Path('areas.json'))
    
    # start prep if flag is True, otherwise exit with code 2
    if start_prep:
        prep()
    else:
        sys.exit(2)