#
# Title: Data Preparation for LINK Configs and Transmissions
# Author: Fabian Kovac <ds191008@fhstp.ac.at>
# Team: University of Applied Sciences St. Pölten
# Version: 1.2
# Last changed: 2021-06-22
#

import sys
import gzip
import pathlib
import argparse
import datetime

import numpy as np
import pandas as pd


def parse_arguments() -> argparse.Namespace:
    """Parses the provided command-line arguments for LINK
    
    Returns:
        args (argparse.Namespace): object with paths to the provided files
    """
    
    # create argument parser with description
    desc = '# Data Preparation for LINK\n'
    desc += '-'*64 + '\n'
    desc += 'Script outputs the same files with a "_clean" suffix.\n'
    desc += 'Existing clean versions are automatically overwritten!'
    
    parser = argparse.ArgumentParser(
        prog = 'prep.py',
        usage = 'python %(prog)s -c <config_file> -t <transmissions_file> -i <inca_dir>',
        description = desc,
        formatter_class = argparse.RawTextHelpFormatter
    )
    
    # add required argument group with config, transmissions and inca parameters
    required_args = parser.add_argument_group('required arguments')
    required_args.add_argument('-c', '--config', type = str, required = True, help = 'Path to Config-File')
    required_args.add_argument('-t', '--transmissions', type = str, required = True, help = 'Path to Transmissions-File')
    required_args.add_argument('-i', '--inca', type = str, required = True, help = 'Path to Inca-Dir')
    
    # parse arguments
    args = parser.parse_args()
    
    return args


def _log(msg: str) -> None:
    """Logs messages if verbose flag is set to True
    
    Parameters:
        msg (str): Message to log to console
        verbose (bool): Outputs message if set to True
	"""
    
    # add marker to log message
    marker = '%'
    if msg[:1] == '\n':
        msg = f'\n{marker} {msg[1:]}'
    else:
        msg = f'{marker} {msg}'
    
    # print message
    print(msg)


def load_inca_file(file_inca: pathlib.Path) -> np.ndarray:
    """Loads gzipped INCA data from a given file

    Parameters:
       file_inca (pathlib.Path): Path to INCA file

    Returns:
        x (np.array): Matrix with shape (401, 701)
    """
    
    # open gzipped file and reshape data to the INCA grid (701 x 401 cells, 1 km resolution)
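    # NOTE: each file is assumed to hold 401*701 whitespace-separated RR values
    # in row-major order, matching the reshape below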
    with gzip.open(file_inca, 'rb') as file:
        x = file.read()
        # np.fromstring is deprecated for text parsing, so split manually
        x = np.array(x.split(), dtype = float)
        x = np.reshape(x, (401, 701))

        return x


def load_inca_data(dir_inca: pathlib.Path) -> np.ndarray:
    """Loads all INCA files from a given INCA directory

    Parameters:
        dir_inca (pathlib.Path): Directory for INCA files

    Returns:
        inca_data (np.array): Tensor with shape (96, 401, 701)
    """
    
    # initialize tensor with 96 15min intervals
    inca_data = np.zeros((96, 401, 701))
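    # NOTE: assumes at most 96 files (one per 15min interval) and that
    # lexicographic filename order equals chronological order
    # (e.g. zero-padded timestamps in the filenames)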

    # load inca files from inca dir
    for i, file_inca in enumerate(sorted([file for file in dir_inca.iterdir() if file.is_file()])):        
        # load zipped ascii data
        data = load_inca_file(file_inca)
        
        # update inca tensor
        inca_data[i] = data
    
    return inca_data


def get_midpoint(lon_a: np.ndarray, lat_a: np.ndarray, lon_b: np.ndarray, lat_b: np.ndarray) -> tuple:
    """Calculates the spherical midpoint between two WGS84 coordinate pairs

    Parameters:
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B

    Returns:
        lon, lat (np.array): Vectors with longitudes and latitudes of the midpoints
    """
    
    # convert degrees to radians
    lon_a, lat_a = np.radians(lon_a), np.radians(lat_a)
    lon_b, lat_b = np.radians(lon_b), np.radians(lat_b)
    
    # calculate midpoint
    Bx = np.cos(lat_b) * np.cos(lon_b - lon_a)
    By = np.cos(lat_b) * np.sin(lon_b - lon_a)
    
    lon_mid = lon_a + np.arctan2(By, np.cos(lat_a) + Bx)
    lat_mid = np.arctan2(
        np.sin(lat_a) + np.sin(lat_b),
        np.sqrt(np.square(np.cos(lat_a) + Bx) + np.square(By))
    )
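    
    # sanity check with illustrative values: for two nearby points such as
    # (16.37E, 48.21N) and (14.29E, 48.31N) the midpoint lands close to the
    # plain coordinate average, roughly (15.3E, 48.3N)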
    
    return np.degrees(lon_mid), np.degrees(lat_mid)


def get_distance(lon_a: np.ndarray, lat_a: np.ndarray, lon_b: np.ndarray, lat_b: np.ndarray) -> np.ndarray:
    """Calculates the distance between two coordinates in km
    using a rotation ellipsoid in cartesian coordinates derived from polar coordinates

    Parameters:
        lon_a (np.array): Longitudes of point A
        lat_a (np.array): Latitudes of point A
        lon_b (np.array): Longitudes of point B
        lat_b (np.array): Latitudes of point B

    Returns:
        length (np.array): Vector with distances in km (can directly be assigned a pandas column)
    """

    # constants (equator radius and pole radius in km)
    # r_equator is the 'semi-major axis' and r_pole the 'semi-minor axis' on a WGS84 ellipsoid
    r_equator = 6378.137
    r_pole = 6356.7523142
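
    # NOTE: the formula below yields the straight chord through the ellipsoid;
    # for typical link lengths (well below 50 km) this differs from the true
    # surface distance by less than a metre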

    # calculate rotation ellipsoid in cartesian coordinates from polar coordinates
    za = np.sin(np.radians(lat_a)) * r_pole
    ra = np.cos(np.radians(lat_a)) * r_equator
    xa = np.sin(np.radians(lon_a)) * ra
    ya = np.cos(np.radians(lon_a)) * ra

    zb = np.sin(np.radians(lat_b)) * r_pole
    rb = np.cos(np.radians(lat_b)) * r_equator
    xb = np.sin(np.radians(lon_b)) * rb
    yb = np.cos(np.radians(lon_b)) * rb

    # calculate distances between point a and point b
    dx = xa - xb
    dy = ya - yb
    dz = za - zb
    length = np.sqrt(np.square(dx) + np.square(dy) + np.square(dz))

    return length


def utm_to_lambert(lon: np.ndarray, lat: np.ndarray) -> tuple:
    """Converts WGS84 coordinates (degrees) to a Lambert Conic Conformal Projection
    
    Parameters:
        lon (np.array): Vector containing longitudes
        lat (np.array): Vector containing latitudes
    
    Returns:
        x, y (tuple(np.array, np.array)): tuple of two vectors with coordinates
	"""
    
    # convert coordinates (degrees) to radians
    lon = np.radians(lon)
    lat = np.radians(lat)
    
    # define standard parallels according to EPSG:31287 - MGI/Austria Lambert
    # --> see https://epsg.io/31287
    lat1 = np.radians(49)
    lat2 = np.radians(46)
    
    # point of reference:
    #   lon: 13°20'E
    #   lat: 47°30'N
    lon0 = np.radians(13.33333333333333)
    lat0 = np.radians(47.5)
    
    # INCA grid: 701x401 km
    # the point of reference lies in the middle of the grid, so compensate with
    # false easting: half of 701km = 350500m
    # false northing: half of 401km = 200500m
    x0 = 350500
    y0 = 200500
    
    # volumetric mean radius of the earth in m
    R = 6371000

    # lambert conformal conic projection:
    # --> see https://mathworld.wolfram.com/LambertConformalConicProjection.html
    n = np.log(np.cos(lat1) * (1 / np.cos(lat2))) / np.log(np.tan(np.pi/4 + lat2/2) * (1 / np.tan(np.pi/4 + lat1/2)))
    F = (np.cos(lat1) * np.tan(np.pi/4 + lat1/2)**n) / n
    p = R * F * (1 / np.tan(np.pi/4 + lat/2))**n
    p0 = R * F * (1 / np.tan(np.pi/4 + lat0/2))**n

    # calculate lambert conic conformal x and y
    x = p * np.sin(n * (lon - lon0)) + x0
    y = p0 - p * np.cos(n * (lon - lon0)) + y0
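
    # sanity check: projecting the reference point itself (lon0, lat0) yields
    # exactly (x0, y0), since p == p0, sin(0) == 0 and cos(0) == 1 there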

    return x, y


def lambert_to_inca_idx(x: np.ndarray, y: np.ndarray) -> tuple:
    """Converts x and y of a Lambert Conic Conformal Projection to INCA grid
    indices (lambert coordinates in km, rounded to the nearest cell)
    
    Parameters:
        x (np.array): Vector containing x values in meter of Lambert Conic Conformal Projection
        y (np.array): Vector containing y values in meter of Lambert Conic Conformal Projection
    
    Returns:
        ix, iy (tuple(np.array, np.array)): tuple of indices for INCA data
	"""
    
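    # grid cells are 1 x 1 km, so dividing the lambert coordinates (in m) by
    # 1000 and rounding yields the cell index, e.g. x = 123456 m -> ix = 123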
    return np.round(x/1000, decimals = 0).astype(int), np.round(y/1000, decimals = 0).astype(int)


def get_inca_data(inca_data: np.ndarray, datetimes: pd.Series, x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Gets INCA RR data based on Lambert Conic Conformal Coordinates
    
    Parameters:
        inca_data (np.array): Tensor containing INCA RR data
        datetimes (pd.Series): Vector containing datetimes of transmissions
        x (np.array): Vector containing x values of LINK
        y (np.array): Vector containing y values of LINK
    
    Returns:
        inca_RR (np.array): Vector containing INCA RR data based on datetimes and lambert coordinates
    """
    
    # convert utm coordinates to lambert conic conformal projection
    lccX, lccY = utm_to_lambert(x, y)
    
    # convert lambert coordinates to INCA indices
    idx, idy = lambert_to_inca_idx(lccX, lccY)
    
    
    # generate times of day in 15min (window) intervals
    window = 15
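    # NOTE: i * timedelta(seconds = 15) for i in range(96) runs from 0:00:00 to
    # 0:23:45, so str(...)[2:] yields 'MM:SS' strings ('00:00' ... '23:45') that
    # read exactly like the 'HH:MM' labels of the 96 15min intervals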
    inca_times = sorted([str(i * datetime.timedelta(seconds = window))[2:] for i in range(24*60//window)])
    
    # generate times of LINK data
    link_times = datetimes.map(lambda x: f'{x[-4:-2]}:{x[-2:]}')
    
    # get INCA indices of LINK times
    idx_times = np.searchsorted(inca_times, link_times)
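    # NOTE: BEGINTIME is floored to 15min boundaries beforehand, so every LINK
    # time matches an entry in inca_times exactly and searchsorted returns the
    # index of the corresponding interval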
    
    
    # get INCA RR data based on time indices and lambert coordinates
    inca_RR = inca_data[idx_times, idy, idx]
    
    # TODO: discuss retrieving INCA RR data
    # currently data is based on the exact point of the link
    # maybe get mean RR of 5x5 km grid around point?
    # maybe get RR data based on conv2d with a 5x5 gaussian filter around point?
    # --> let's discuss!
    
    return inca_RR


def prep(file_config: pathlib.Path, file_trans: pathlib.Path, dir_inca: pathlib.Path) -> None:
    """Data preparation for LINK config and transmissions
    
    Parameters:
        file_config (pathlib.Path): Config File
        file_trans (pathlib.Path): Transmissions File
        dir_inca (pathlib.Path): INCA Directory
    """
    
    _log('\n******************************** READ FILES ********************************')
    
    # read config and transmission files
    df_config = pd.read_csv(file_config, sep = ';')
    _log(f'Read config file with shape {df_config.shape}')
    df_trans = pd.read_csv(file_trans, sep = ';')
    _log(f'Read transmissions file with shape {df_trans.shape}')
    
    
    _log('\n******************************** BASIC PREP ********************************')
    
    # remove test-link with link id 1
    df_config = df_config[df_config['LINKID'] != 1]
    df_trans = df_trans[df_trans['RADIOLINKID'] != 1]
    _log('Removed all entries of test-link with linkid 1')
    
    
    # drop links that are officially not in use ('na' in CAPACITYINTERFACE and/or FREQUENCY)
    # --> see Q&A Phillip Scheffknecht (05 Feb 2021)
    df_config = df_config.dropna(axis = 0, subset = ['CAPACITYINTERFACE', 'FREQUENCY'])
    _log('Dropped configs with NA in CAPACITYINTERFACE and/or FREQUENCY (links officially not in use)')
    
    
    # delete rows with unused link ids
    # get link ids of config and transmissions
    config_ids = set(df_config['LINKID'].unique().tolist())
    trans_ids = set(df_trans['RADIOLINKID'].unique().tolist())
    
    # delete link ids in transmissions without config
    unused_trans_ids = trans_ids - config_ids
    df_trans = df_trans[~df_trans['RADIOLINKID'].isin(list(unused_trans_ids))]
    _log('Removed all links in transmissions where no config is present')
    
    # delete link ids in config without transmissions
    unused_config_ids = config_ids - trans_ids
    df_config = df_config[~df_config['LINKID'].isin(list(unused_config_ids))]
    _log('Removed all links in config where no transmission is present')
    
    
    # delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
    # gather duplicated rows in config file
    col_subset = ['LINKTYPE', 'SITEID_A', 'LATITUDE_A', 'LONGITUDE_A', 'SITEID_B', 'LATITUDE_B', 'LONGITUDE_B', 'CAPACITYINTERFACE', 'FREQUENCY']
    duplicated_config_ids = set(df_config[df_config.duplicated(subset = col_subset)]['LINKID'].unique().tolist())
    
    # gather duplicated link ids of config file in transmissions file
    found_trans_ids = set(df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist())
    
    # calculate unused duplicated ids in config file
    duplicated_unused_ids = duplicated_config_ids - found_trans_ids
    
    # delete rows with unused duplicated link ids in config file
    df_config = df_config[~df_config['LINKID'].isin(list(duplicated_unused_ids))]
    _log('Removed duplicated links which are not in use')
    
    
    # check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B) pairs
    # gather unique combinations of link ids and site ids of config
    # (a new temporary column is much faster than the pandas agg function)
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_A']
    config_loc_tuples = set(df_config['TEMP_LOC_TUPLE'].unique().tolist())
    df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_B']
    config_loc_tuples.update(set(df_config['TEMP_LOC_TUPLE'].unique().tolist()))
    
    # gather unique combinations of link ids and locations in transmissions
    # (a new temporary column is much faster than the pandas agg function)
    df_trans['TEMP_LOC_TUPLE'] = df_trans['RADIOLINKID'].astype(str) + '--' + df_trans['LOCATION']
    trans_loc_tuples = set(df_trans['TEMP_LOC_TUPLE'].unique().tolist())
    
    # calculate link id - location tuples which are not in config
    invalid_loc_tuples = trans_loc_tuples - config_loc_tuples
    
    # remove invalid tuples from transmissions
    df_trans = df_trans[~df_trans['TEMP_LOC_TUPLE'].isin(list(invalid_loc_tuples))]
    _log('Removed all transmissions with invalid RADIOLINKID - LOCATION tuples (not present in config)')
    
    # remove temp columns
    df_config = df_config.drop(['TEMP_LOC_TUPLE'], axis = 1)
    df_trans = df_trans.drop(['TEMP_LOC_TUPLE'], axis = 1)
    
    
    # calculate midpoint of links
    df_config['LONGITUDE_MID'], df_config['LATITUDE_MID'] = get_midpoint(df_config['LONGITUDE_A'], df_config['LATITUDE_A'], df_config['LONGITUDE_B'], df_config['LATITUDE_B'])
    _log('Calculated midpoint of links')
    
    # calculate LENGTH in km between the two sites of each link
    df_config['LENGTH'] = get_distance(df_config['LONGITUDE_A'], df_config['LATITUDE_A'], df_config['LONGITUDE_B'], df_config['LATITUDE_B'])
    _log('Calculated distances between sites using a WGS84 ellipsoid')
    
    
    # convert FREQUENCY to float
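    # (the last three characters are assumed to be a unit suffix, e.g. ' GHz')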
    df_config['FREQUENCY'] = df_config['FREQUENCY'].map(lambda x: str(x)[:-3]).astype('float')
    _log('Converted FREQUENCY to float')
    
    # convert RXFREQUENCY and TXFREQUENCY to float and from MHz to GHz
    # check if columns exists (only present with 2021-05)
    if 'RXFREQUENCY' in df_config.columns and 'TXFREQUENCY' in df_config.columns:
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY'].astype('float')
        df_config['RXFREQUENCY'] = df_config['RXFREQUENCY']/1000
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY'].astype('float')
        df_config['TXFREQUENCY'] = df_config['TXFREQUENCY']/1000
        _log('Converted RXFREQUENCY and TXFREQUENCY to float and GHz')
    
    
    # drop transmissions with STATUS and/or OPERATIONALSTATUS not equal to 1
    df_trans = df_trans[df_trans['STATUS'] == 1]
    df_trans = df_trans[df_trans['OPERATIONALSTATUS'] == 1]
    _log('Removed transmissions with STATUS and/or OPERATIONALSTATUS not equal to 1')
    
    
    _log('\n******************************** BUILD LINK DF *****************************')
    
    # copy transmissions dataframe to link dataframe
    df_link = df_trans.copy()
    _log('Copied transmissions dataframe to link dataframe')
    
    
    # convert BEGINTIME to UTC
    df_link['BEGINTIME'] = pd.to_datetime(df_link['BEGINTIME'], format = '%Y-%m-%d %H:%M:%S')
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.tz_localize('Europe/Vienna').dt.tz_convert('UTC').dt.tz_localize(None)
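    # NOTE: tz_localize('Europe/Vienna') raises on ambiguous or nonexistent
    # local times around DST changes; pandas' 'ambiguous'/'nonexistent'
    # arguments could handle this if such days ever need processing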
    _log('Converted BEGINTIME to UTC')
    
    # only use transmissions with begintime matching INCA date
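    # (the INCA directory name is assumed to be of the form 'YYYYMMDD')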
    date_inca = np.datetime64(f'{str(dir_inca.stem)[0:4]}-{str(dir_inca.stem)[-4:-2]}-{str(dir_inca.stem)[-2:]}')
    df_link = df_link[df_link['BEGINTIME'].dt.date == date_inca]
    _log('Filtered transmissions to match BEGINTIME with INCA date')
    
    
    # copy REMOTERXLEVEL to PMIN and PMAX (for aggregation in 15min window conversion)
    df_link['PMIN'] = df_link['REMOTERXLEVEL']
    df_link['PMAX'] = df_link['REMOTERXLEVEL']
    _log('Created PMIN and PMAX of REMOTERXLEVEL')
    
    # convert 3min windows to 15min windows
    group_cols = [df_link['BEGINTIME'].dt.floor('15Min'), 'RADIOLINKID']
    agg_cols = {'TXLEVEL' : 'mean', 'REMOTERXLEVEL' : 'mean', 'PMIN' : 'min', 'PMAX' : 'max'}
    df_link = df_link.groupby(group_cols).agg(agg_cols).reset_index()
    _log('Converted 3min windows to 15min windows')
    
    # convert BEGINTIME to RAINLINK format
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.strftime('%Y%m%d%H%M')
    _log('Converted BEGINTIME to RAINLINK format "%Y%m%d%H%M"')

    
    # build df with differences of sending and receiving levels
    df_diff = df_link[['RADIOLINKID', 'TXLEVEL', 'REMOTERXLEVEL']].copy()
    df_diff['MEANLINKDIFFLEVEL'] = df_diff['TXLEVEL'] - df_diff['REMOTERXLEVEL']
    _log('Built dataframe with mean link difference levels of TXLEVEL and REMOTERXLEVEL')
    
    # get mean of differences
    df_diff = df_diff.groupby(['RADIOLINKID']).agg({'MEANLINKDIFFLEVEL' : 'mean'}).reset_index()
    _log('Calculated mean link difference level per link')
    
    # merge differences to transmission dataframe
    df_link = pd.merge(df_link, df_diff, how = 'inner', left_on = 'RADIOLINKID', right_on = 'RADIOLINKID')
    df_link['DIFFLEVEL'] = df_link['TXLEVEL'] - df_link['REMOTERXLEVEL'] - df_link['MEANLINKDIFFLEVEL']
    _log('Calculated DIFFLEVEL as TXLEVEL - REMOTERXLEVEL - MEANLINKDIFFLEVEL')
    
    
    # merge config and link dataframe
    drop_cols = ['RADIOLINKID', 'LINKTYPE', 'SITEID_A', 'SITEID_B', 'CAPACITYINTERFACE']
    df_link = pd.merge(df_link, df_config, how = 'inner', left_on = 'RADIOLINKID', right_on = 'LINKID').drop(drop_cols, axis = 1)
    _log('Merged config data to link dataframe')
    
    # rename and reorder columns to aid RAINLINK format
    name_cols = {
        'LINKID' : 'ID',
        'BEGINTIME' : 'DateTime',
        'PMIN' : 'Pmin',
        'PMAX' : 'Pmax',
        'REMOTERXLEVEL' : 'Pmean',
        'TXLEVEL' : 'TxLevel',
        'MEANLINKDIFFLEVEL' : 'MeanLinkDiffLevel',
        'DIFFLEVEL' : 'DiffLevel',
        'LONGITUDE_A' : 'XStart',
        'LATITUDE_A' : 'YStart',
        'LONGITUDE_B' : 'XEnd',
        'LATITUDE_B' : 'YEnd',
        'LONGITUDE_MID' : 'XMid',
        'LATITUDE_MID' : 'YMid',
        'LENGTH' : 'PathLength',
        'FREQUENCY' : 'Frequency',
    }
    
    # check if RXFREQUENCY and TXFREQUENCY exists (only present with 2021-05)
    if 'RXFREQUENCY' in df_link.columns and 'TXFREQUENCY' in df_link.columns:
        name_cols.update({
            'RXFREQUENCY' : 'RxFrequency',
            'TXFREQUENCY' : 'TxFrequency'
        })
    
    df_link = df_link.rename(columns = name_cols).reindex(columns = list(name_cols.values()))
    _log('Converted link dataframe to RAINLINK format')
    
    
    _log('\n******************************** MERGE INCA ********************************')
    
    # load inca data
    inca_data = load_inca_data(dir_inca)
    _log(f'Loaded INCA data from {dir_inca.name} with shape {inca_data.shape}')
    
    # set INCA RR data based on datetime and coordinates
    df_link['RRStart'] = get_inca_data(inca_data, df_link['DateTime'], df_link['XStart'], df_link['YStart'])
    df_link['RREnd'] = get_inca_data(inca_data, df_link['DateTime'], df_link['XEnd'], df_link['YEnd'])
    df_link['RRMid'] = get_inca_data(inca_data, df_link['DateTime'], df_link['XMid'], df_link['YMid'])
    _log('Merged INCA RR data to LINK dataframe')
    
    
    _log('\n******************************** SAVE FILES ********************************')
    
    # build path for clean config and transmissions destination files
    dest_config = file_config.with_name(f'{file_config.stem}_clean{file_config.suffix}')
    dest_trans = file_trans.with_name(f'{file_trans.stem}_clean{file_trans.suffix}')
    
    # build path for clean link destination file (same folder, date and extension as transmissions file)
    date = str(file_trans.stem).split('_')[-1]
    dest_link = pathlib.Path(dest_trans.parents[0], f'LINK_{date}_clean{file_trans.suffix}')
    
    
    # save clean files
    df_config.to_csv(dest_config, sep = ';', header = True, index = False)
    _log(f'Saved clean config file with shape {df_config.shape} to "{str(dest_config)}"')
    df_trans.to_csv(dest_trans, sep = ';', header = True, index = False)
    _log(f'Saved clean transmissions file with shape {df_trans.shape} to "{str(dest_trans)}"')
    df_link.to_csv(dest_link, sep = ';', header = True, index = False)
    _log(f'Saved clean link file with shape {df_link.shape} to "{str(dest_link)}"')


if __name__ == '__main__':
    # flag for data prep
    start_prep = True
    
    # get config and transmissions file from arguments
    args = parse_arguments()
    
    # convert config and transmissions arguments to paths
    file_config = pathlib.Path(args.config)
    file_trans = pathlib.Path(args.transmissions)
    dir_inca = pathlib.Path(args.inca)
    
    # check if config file exists
    if not file_config.exists():
        _log('Invalid path for config file!')
        start_prep = False
        
    # check if transmissions file exists
    if not file_trans.exists():
        _log('Invalid path for transmissions file!')
        start_prep = False
    
    # check if inca dir exists
    if not dir_inca.exists():
        _log('Invalid path for inca directory!')
        start_prep = False
    
    # start prep if flag is True, otherwise exit with code 2
    if start_prep:
        prep(file_config, file_trans, dir_inca)
    else:
        sys.exit(2)
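

# example invocation (file names are illustrative):
#   python prep.py -c CONFIG_20210522.csv -t TRANSMISSIONS_20210522.csv -i inca/20210522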