Commit da15bf9c authored by Fabian Kovac
Browse files

[f] updated prep.py for link data preparation

parent 09030aee
#
# Title: Data Preparation for LINK Configs and Transmissions
# Author: Fabian Kovac <ds191008@fhstp.ac.at>
# Team: University of Applied Sciences St. Pölten
# Version: 1.0
# Last changed: 2021-06-15
#
import sys
import pathlib
import argparse
import numpy as np
import pandas as pd
def parse_arguments() -> argparse.Namespace:
    """Parses provided commandline arguments for LINK config file and transmissions

    Both options are declared as required, so argparse itself aborts the
    script with a usage message if one of them is missing.

    Returns:
        args (argparse.Namespace): object with the attributes `config` and
            `transmissions`, holding the provided paths as strings
    """

    # create argument parser with description
    # (RawTextHelpFormatter keeps the manual line breaks of the description)
    desc = '# Data Preparation for LINK Configs and Transmissions\n'
    desc += '-'*64 + '\n'
    desc += 'Script outputs the same files with a "_clean" suffix.\n'
    desc += 'Existing clean versions are automatically overwritten!'

    parser = argparse.ArgumentParser(
        prog = 'prep.py',
        usage = 'python %(prog)s -c <config_file> -t <transmissions_file>',
        description = desc,
        formatter_class = argparse.RawTextHelpFormatter
    )

    # add required argument group, so both options are listed under their own
    # heading in the help output instead of under the default options heading
    required_args = parser.add_argument_group('required arguments')

    # add config parameter to parser
    required_args.add_argument('-c', '--config', type = str, required = True, help = 'Path to Config-File')

    # add transmissions parameter to parser
    # (help text previously repeated the config description)
    required_args.add_argument('-t', '--transmissions', type = str, required = True, help = 'Path to Transmissions-File')

    # parse arguments
    args = parser.parse_args()

    return args
def _log(msg: str) -> None:
    """Prints a message to the console, prefixed with a '%' marker

    If the message starts with a line break, that line break is emitted in
    front of the marker, so the marker always sits at the start of the text line.

    Parameters:
        msg (str): Message to log to console
    """

    prefix = '%'

    # move a leading line break in front of the marker
    if msg.startswith('\n'):
        print(f'\n{prefix} {msg[1:]}')
        return

    # regular message: marker, blank, text
    print(f'{prefix} {msg}')
def get_distance(lat_a: pd.Series, lon_a: pd.Series, lat_b: pd.Series, lon_b: pd.Series) -> pd.Series:
    """Calculates distance between two coordinates in km using the haversine function

    All computations are element-wise numpy operations, so the inputs are
    processed as whole vectors without a Python loop.

    Parameters:
        lat_a (pd.Series): Latitudes of point A in degrees
        lon_a (pd.Series): Longitudes of point A in degrees
        lat_b (pd.Series): Latitudes of point B in degrees
        lon_b (pd.Series): Longitudes of point B in degrees

    Returns:
        km (pd.Series): Vector with distances in km (can directly be assigned a pandas column)
    """

    # sphere radius in km used to scale the central angle to a distance
    earth_radius_km = 6371

    # convert latitudes and longitudes to radians
    lat_a, lon_a = np.radians(lat_a), np.radians(lon_a)
    lat_b, lon_b = np.radians(lat_b), np.radians(lon_b)

    # calculate differences of latitudes and longitudes
    diff_lat = lat_a - lat_b
    diff_lon = lon_a - lon_b

    # haversine term of the central angle between point a and point b
    a = np.sin(diff_lat/2)**2 + np.cos(lat_a) * np.cos(lat_b) * np.sin(diff_lon/2)**2

    # floating point rounding can push the term marginally outside [0, 1]
    # (e.g. for nearly antipodal points), which would turn sqrt(1-a) into NaN
    a = np.clip(a, 0.0, 1.0)

    # calculate distances in km between point a and point b
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    km = earth_radius_km * c

    return km
def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
    """Data preparation for LINK config and transmissions

    Reads both ';'-separated files, cleans them, builds a combined link
    dataframe (15min windows, RAINLINK column names) and writes three files:
    the config and transmissions files with a "_clean" suffix next to their
    originals, and "LINK_<date>_clean" in the folder of the transmissions
    file, where <date> is the last 10 characters of the transmissions file stem.

    Parameters:
        file_config (pathlib.Path): Config File
        file_trans (pathlib.Path): Transmissions File
    """

    _log('\n******************************** READ FILES ********************************')

    # read files
    df_config = pd.read_csv(file_config, sep = ';')
    _log(f'Read config file with shape {df_config.shape}')

    df_trans = pd.read_csv(file_trans, sep = ';')
    _log(f'Read transmissions file with shape {df_trans.shape}')


    _log('\n******************************** BASIC PREP ********************************')

    # remove test-link with link id 1
    df_config = df_config[df_config['LINKID'] != 1]
    df_trans = df_trans[df_trans['RADIOLINKID'] != 1]
    _log('Removed all entries of test-link with linkid 1')

    # drop links that are officially not in use ('na' in CAPACITYINTERFACE and/or FREQUENCY)
    # --> see Q&A Phillip Scheffknecht (05 Feb 2021)
    df_config = df_config.dropna(axis = 0, subset = ['CAPACITYINTERFACE', 'FREQUENCY'])
    _log('Dropped configs with NA in CAPACITYINTERFACE and/or FREQUENCY (links officially not in use)')

    # delete rows with unused link ids
    # get link ids of config and transmissions
    config_ids = df_config['LINKID'].unique().tolist()
    trans_ids = df_trans['RADIOLINKID'].unique().tolist()

    # delete link ids in transmissions without config
    unused_trans_ids = set(trans_ids) - set(config_ids)
    df_trans = df_trans[~df_trans['RADIOLINKID'].isin(list(unused_trans_ids))]
    _log('Removed all links in transmissions where no config is present')

    # delete link ids in config without transmissions
    unused_config_ids = set(config_ids) - set(trans_ids)
    df_config = df_config[~df_config['LINKID'].isin(list(unused_config_ids))]
    _log('Removed all links in config where no transmission is present')

    # delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
    # NOTE(review): after the two steps above every remaining config link id also occurs in the
    # transmissions, so the set difference below looks like it is always empty -- confirm intent
    # gather duplicated rows in config file
    col_subset = ['LINKTYPE', 'SITEID_A', 'LATITUDE_A', 'LONGITUDE_A', 'SITEID_B', 'LATITUDE_B', 'LONGITUDE_B', 'CAPACITYINTERFACE', 'FREQUENCY']
    duplicated_config_ids = df_config[df_config.duplicated(subset = col_subset)]['LINKID'].unique().tolist()

    # gather duplicated link ids of config file in transmissions file
    found_trans_ids = df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist()

    # calculate unused duplicated ids in config file
    duplicated_unused_ids = set(duplicated_config_ids) - set(found_trans_ids)

    # delete rows with unused duplicated link ids in config file
    # (explicit copy: new columns are assigned below, which would otherwise
    # happen on a filtered slice and trigger pandas' SettingWithCopyWarning)
    df_config = df_config[~df_config['LINKID'].isin(list(duplicated_unused_ids))].copy()
    _log('Removed duplicated links which are not in use')

    # calculate LENGTH in km between links using the haversine function
    df_config['LENGTH'] = get_distance(df_config['LATITUDE_A'], df_config['LONGITUDE_A'], df_config['LATITUDE_B'], df_config['LONGITUDE_B'])
    _log('Calculate distance between sites using the haversine function')

    # convert FREQUENCY to float
    # (cuts off the last three characters of the string representation,
    # presumably a unit suffix -- verify against the config file)
    df_config['FREQUENCY'] = df_config['FREQUENCY'].map(lambda x: str(x)[:-3]).astype('float')
    _log('Converted FREQUENCY to float')

    # TODO: drop transmissions with (operational) status unequal 1?
    # check occurences with: df_trans[(df_trans['STATUS'] != 1) | (df_trans['OPERATIONALSTATUS'] != 1)].shape
    df_trans = df_trans[df_trans['STATUS'] == 1]
    df_trans = df_trans[df_trans['OPERATIONALSTATUS'] == 1]
    _log('Removed transmissions with STATUS and/or OPERATIONALSTATUS unequal 1')

    # TODO: check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B)
    # TODO: check foreach BEGINTIME if foreach RADIOLINKID two rows with different LOCATATIONS are present
    # TODO: check if (RADIOLINKID - LOCATION_A - LOCATION_B) triple equals (LINKID - SITEID_A - SITEID_B) triple
    # should we add checks (a preparation pipeline) regarding site and location data?
    # SYSTEMNAME, IFNAME and BEGINTIME are identifiers for a unique link to a given timestamp (passed data quality assessment)
    # --> see Q&A field descriptions Oliver Eigner (05 Feb 2021)


    _log('\n******************************** BUILD LINK DF *****************************')

    # copy transmissions dataframe to link dataframe
    df_link = df_trans.copy()
    _log('Copy transmissions dataframe to link dataframe')

    # convert begintime to utc (timestamps are interpreted as Europe/Vienna local time
    # and stored as naive UTC timestamps afterwards)
    # NOTE(review): tz_localize raises by default on ambiguous or non-existent local
    # times (daylight saving switch) -- confirm the input never contains such timestamps
    df_link['BEGINTIME'] = pd.to_datetime(df_link['BEGINTIME'], format = '%Y-%m-%d %H:%M:%S')
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.tz_localize('Europe/Vienna').dt.tz_convert('UTC').dt.tz_localize(None)
    _log('Converted BEGINTIME to UTC')

    # copy REMOTERXLEVEL to PMIN and PMAX (for aggregation in 15min window conversion)
    df_link['PMIN'] = df_link['REMOTERXLEVEL']
    df_link['PMAX'] = df_link['REMOTERXLEVEL']
    _log('Created PMIN and PMAX of REMOTERXLEVEL')

    # convert 3min windows to 15min windows
    # (group by the BEGINTIME floored to 15 minutes and by link id)
    group_cols = [df_link['BEGINTIME'].dt.floor('15Min'), 'RADIOLINKID']
    agg_cols = {'TXLEVEL' : 'mean', 'REMOTERXLEVEL' : 'mean', 'PMIN' : 'min', 'PMAX' : 'max'}
    df_link = df_link.groupby(group_cols).agg(agg_cols).reset_index()
    _log('Converted 3min windows to 15min windows')

    # convert BEGINTIME to RAINLINK format
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.strftime('%Y%m%d%H%M')
    _log('Converted BEGINTIME to RAINLINK format "%Y%m%d%H%M"')

    # build df with differences of sending and receiving levels
    df_diff = df_link[['RADIOLINKID', 'TXLEVEL', 'REMOTERXLEVEL']].copy()
    df_diff['MEANLINKDIFFLEVEL'] = df_diff['TXLEVEL'] - df_diff['REMOTERXLEVEL']

    # get mean of differences per link
    df_diff = df_diff.groupby(['RADIOLINKID']).agg({'MEANLINKDIFFLEVEL' : 'mean'}).reset_index()
    _log('Built dataframe with mean link difference levels of TXLEVEL and REMOTERXLEVEL')

    # merge differences to transmission dataframe
    df_link = pd.merge(df_link, df_diff, how = 'inner', left_on = 'RADIOLINKID', right_on = 'RADIOLINKID')
    _log('Merged mean link difference levels back to link dataframe')

    # deviation of each window's level difference from the mean difference of its link
    df_link['DIFFLEVEL'] = df_link['TXLEVEL'] - df_link['REMOTERXLEVEL'] - df_link['MEANLINKDIFFLEVEL']
    _log('Calculated DIFFLEVEL as TXLEVEL - REMOTERXLEVEL - MEANLINKDIFFLEVEL')

    # merge config and link dataframe
    drop_cols = ['RADIOLINKID', 'LINKTYPE', 'SITEID_A', 'SITEID_B', 'CAPACITYINTERFACE']
    df_link = pd.merge(df_link, df_config, how = 'inner', left_on = 'RADIOLINKID', right_on = 'LINKID').drop(drop_cols, axis = 1)
    _log('Merged config data to link dataframe')

    # rename and reorder columns to aid RAINLINK format
    # (the order of this mapping also defines the column order of the output)
    name_cols = {
        'FREQUENCY' : 'Frequency',
        'BEGINTIME' : 'DateTime',
        'PMIN' : 'Pmin',
        'PMAX' : 'Pmax',
        'TXLEVEL' : 'TxLevel',
        'REMOTERXLEVEL' : 'RemoteRxLevel',
        'MEANLINKDIFFLEVEL' : 'MeanLinkDiffLevel',
        'DIFFLEVEL' : 'DiffLevel',
        'LENGTH' : 'PathLength',
        'LONGITUDE_A' : 'XStart',
        'LATITUDE_A' : 'YStart',
        'LONGITUDE_B' : 'XEnd',
        'LATITUDE_B' : 'YEnd',
        'LINKID' : 'ID'
    }
    df_link = df_link.rename(columns = name_cols).reindex(columns = list(name_cols.values()))
    _log('Converted link dataframe to RAINLINK format')


    _log('\n******************************** SAVE FILES ********************************')

    # build path for clean config and transmissions destination files
    dest_config = file_config.with_name(f'{file_config.stem}_clean{file_config.suffix}')
    dest_trans = file_trans.with_name(f'{file_trans.stem}_clean{file_trans.suffix}')

    # build path for clean link destination file (same folder, date and extension as transmissions file)
    date = str(file_trans.stem)[-10:]
    dest_link = pathlib.Path(dest_trans.parents[0], f'LINK_{date}_clean{file_trans.suffix}')

    # save cleaned files
    df_config.to_csv(dest_config, sep = ';', header = True, index = False)
    _log(f'Saved clean config file with shape {df_config.shape} to "{str(dest_config)}"')

    df_trans.to_csv(dest_trans, sep = ';', header = True, index = False)
    _log(f'Saved clean transmissions file with shape {df_trans.shape} to "{str(dest_trans)}"')

    df_link.to_csv(dest_link, sep = ';', header = True, index = False)
    _log(f'Saved clean link file with shape {df_link.shape} to "{str(dest_link)}"')
if __name__ == '__main__':
    # script entry point: parse the arguments, validate both paths, run the preparation

    # get config and transmissions file from arguments
    args = parse_arguments()

    # convert config and transmissions arguments to paths
    file_config = pathlib.Path(args.config)
    file_trans = pathlib.Path(args.transmissions)

    # pathlib.Path() only wraps the string and never touches the file system,
    # so invalid paths have to be detected explicitly
    if not (file_config.is_file() and file_trans.is_file()):
        # print error and exit with code 2 (command line syntax error) if paths are invalid
        print('Invalid path for config and/or transmissions file!')
        sys.exit(2)

    # start data preparation
    prep(file_config, file_trans)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment