Commit bad77c5b authored by Fabian Kovac's avatar Fabian Kovac
Browse files

[i] check for invalid linkid-location tuples

parent 9f4dbd8d
......@@ -138,16 +138,16 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
# delete rows with unused link ids
# get link ids of config and transmissions
config_ids = df_config['LINKID'].unique().tolist()
trans_ids = df_trans['RADIOLINKID'].unique().tolist()
config_ids = set(df_config['LINKID'].unique().tolist())
trans_ids = set(df_trans['RADIOLINKID'].unique().tolist())
# delete link ids in transmissions without config
unused_trans_ids = set(trans_ids) - set(config_ids)
unused_trans_ids = trans_ids - config_ids
df_trans = df_trans[~df_trans['RADIOLINKID'].isin(list(unused_trans_ids))]
_log('Removed all links in transmissions where no config is present')
# delete link ids in config without transmissions
unused_config_ids = set(config_ids) - set(trans_ids)
unused_config_ids = config_ids - trans_ids
df_config = df_config[~df_config['LINKID'].isin(list(unused_config_ids))]
_log('Removed all links in config where no transmission is present')
......@@ -155,18 +155,43 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
# delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
# gather duplicated rows in config file
col_subset = ['LINKTYPE', 'SITEID_A', 'LATITUDE_A', 'LONGITUDE_A', 'SITEID_B', 'LATITUDE_B', 'LONGITUDE_B', 'CAPACITYINTERFACE', 'FREQUENCY']
duplicated_config_ids = df_config[df_config.duplicated(subset = col_subset)]['LINKID'].unique().tolist()
duplicated_config_ids = set(df_config[df_config.duplicated(subset = col_subset)]['LINKID'].unique().tolist())
# gather duplicated link ids of config file in transmissions file
found_trans_ids = df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist()
found_trans_ids = set(df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist())
# calculate unused duplicated ids in config file
duplicated_unused_ids = set(duplicated_config_ids) - set(found_trans_ids)
duplicated_used_ids = duplicated_config_ids - found_trans_ids
# delete rows with unused duplicated link ids in config file
df_config = df_config[~df_config['LINKID'].isin(list(duplicated_unused_ids))]
df_config = df_config[~df_config['LINKID'].isin(list(duplicated_used_ids))]
_log('Removed duplicated links which are not in use')
# check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B) pairs
# gather unique combinations of link ids and site ids of config
# (a new temporary column is much faster than the pandas agg function)
df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_A']
config_loc_tuples = set(df_config['TEMP_LOC_TUPLE'].unique().tolist())
df_config['TEMP_LOC_TUPLE'] = df_config['LINKID'].astype(str) + '--' + df_config['SITEID_B']
config_loc_tuples.update(set(df_config['TEMP_LOC_TUPLE'].unique().tolist()))
# gather unique combinations of link ids and locations in transmissions
# (a new temporary column is much faster than the pandas agg function)
df_trans['TEMP_LOC_TUPLE'] = df_trans['RADIOLINKID'].astype(str) + '--' + df_trans['LOCATION']
trans_loc_tuples = set(df_trans['TEMP_LOC_TUPLE'].unique().tolist())
# calculate link id - location tuples which are not in config
invalid_loc_tuples = trans_loc_tuples - config_loc_tuples
# remove invalid tuples from transmissions
df_trans = df_trans[~df_trans['TEMP_LOC_TUPLE'].isin(list(invalid_loc_tuples))]
_log('Removed all transmissions with invalid RADIOLINKID - LOCATION tuples (not present in config)')
# remove temp columns
df_config = df_config.drop(['TEMP_LOC_TUPLE'], axis = 1)
df_trans = df_trans.drop(['TEMP_LOC_TUPLE'], axis = 1)
# calculate LENGTH in km between links
df_config['LENGTH'] = get_distance(df_config['LATITUDE_A'], df_config['LONGITUDE_A'], df_config['LATITUDE_B'], df_config['LONGITUDE_B'])
......@@ -177,6 +202,15 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
df_config['FREQUENCY'] = df_config['FREQUENCY'].map(lambda x: str(x)[:-3]).astype('float')
_log('Converted FREQUENCY to float')
# convert RXFREQUENCY and TXFREQUENCY to float and from MHz to GHz
# check if columns exist (only present with 2021-05)
if 'RXFREQUENCY' in df_config.columns and 'TXFREQUENCY' in df_config.columns:
df_config['RXFREQUENCY'] = df_config['RXFREQUENCY'].astype('float')
df_config['RXFREQUENCY'] = df_config['RXFREQUENCY']/1000
df_config['TXFREQUENCY'] = df_config['TXFREQUENCY'].astype('float')
df_config['TXFREQUENCY'] = df_config['TXFREQUENCY']/1000
_log('Converted RXFREQUENCY and TXFREQUENCY to float and GHz')
# TODO: drop transmissions with (operational) status unequal 1?
# check occurrences with: df_trans[(df_trans['STATUS'] != 1) | (df_trans['OPERATIONALSTATUS'] != 1)].shape
......@@ -185,14 +219,6 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
_log('Removed transmissions with STATUS and/or OPERATIONALSTATUS unequal 1')
# TODO: check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B)
# TODO: check for each BEGINTIME whether, for each RADIOLINKID, two rows with different LOCATIONS are present
# TODO: check if (RADIOLINKID - LOCATION_A - LOCATION_B) triple equals (LINKID - SITEID_A - SITEID_B) triple
# should we add checks (a preparation pipeline) regarding site and location data?
# SYSTEMNAME, IFNAME and BEGINTIME are identifiers for a unique link to a given timestamp (passed data quality assessment)
# --> see Q&A field descriptions Oliver Eigner (05 Feb 2021)
_log('\n******************************** BUILD LINK DF *****************************')
# copy transmissions dataframe to link dataframe
......@@ -246,21 +272,29 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
# rename and reorder columns to aid RAINLINK format
name_cols = {
'FREQUENCY' : 'Frequency',
'LINKID' : 'ID',
'BEGINTIME' : 'DateTime',
'PMIN' : 'Pmin',
'PMAX' : 'Pmax',
'REMOTERXLEVEL' : 'Pmean',
'TXLEVEL' : 'TxLevel',
'REMOTERXLEVEL' : 'RemoteRxLevel',
'MEANLINKDIFFLEVEL' : 'MeanLinkDiffLevel',
'DIFFLEVEL' : 'DiffLevel',
'LENGTH' : 'PathLength',
'LONGITUDE_A' : 'XStart',
'LATITUDE_A' : 'YStart',
'LONGITUDE_B' : 'XEnd',
'LATITUDE_B' : 'YEnd',
'LINKID' : 'ID'
'LENGTH' : 'PathLength',
'FREQUENCY' : 'Frequency',
}
# check if RXFREQUENCY and TXFREQUENCY exist (only present with 2021-05)
if 'RXFREQUENCY' in df_link.columns and 'TXFREQUENCY' in df_link.columns:
name_cols.update({
'RXFREQUENCY' : 'RxFrequency',
'TXFREQUENCY' : 'TxFrequency'
})
df_link = df_link.rename(columns = name_cols).reindex(columns = list(name_cols.values()))
_log('Converted link dataframe to RAINLINK format')
......@@ -272,7 +306,7 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
dest_trans = file_trans.with_name(f'{file_trans.stem}_clean{file_trans.suffix}')
# build path for clean link destination file (same folder, date and extension as transmissions file)
date = str(file_trans.stem)[-10:]
date = str(file_trans.stem).split('_')[-1]
dest_link = pathlib.Path(dest_trans.parents[0], f'LINK_{date}_clean{file_trans.suffix}')
......@@ -286,17 +320,28 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
if __name__ == '__main__':
# flag for data prep
start_prep = True
# get config and transmissions file from arguments
args = parse_arguments()
try:
# convert config and transmissions arguments to paths
file_config = pathlib.Path(args.config)
file_trans = pathlib.Path(args.transmissions)
except:
# print error and exit with code 2 (command line syntax error) if paths are invalid
print('Invalid path for config and/or transmissions file!')
# convert config and transmissions arguments to paths
file_config = pathlib.Path(args.config)
file_trans = pathlib.Path(args.transmissions)
# check if config file exists
if not file_config.exists():
_log('Invalid path for config file!')
start_prep = False
# check if transmissions file exists
if not file_trans.exists():
_log('% Invalid path for transmissions file!')
start_prep = False
# start prep if flag is True, otherwise exit with code 2
if start_prep:
prep(file_config, file_trans)
else:
sys.exit(2)
# start data preparation
prep(file_config, file_trans)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment