Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
oeigner
LINK - Forschungsprojekt Repo
Commits
bad77c5b
Commit
bad77c5b
authored
Jun 17, 2021
by
Fabian Kovac
Browse files
[i] check for invalid linkid-location tuples
parent
9f4dbd8d
Changes
1
Hide whitespace changes
Inline
Side-by-side
FHSTP/prep.py
View file @
bad77c5b
...
...
@@ -138,16 +138,16 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
# delete rows with unused link ids
# get link ids of config and transmissions
config_ids
=
df_config
[
'LINKID'
].
unique
().
tolist
()
trans_ids
=
df_trans
[
'RADIOLINKID'
].
unique
().
tolist
()
config_ids
=
set
(
df_config
[
'LINKID'
].
unique
().
tolist
()
)
trans_ids
=
set
(
df_trans
[
'RADIOLINKID'
].
unique
().
tolist
()
)
# delete link ids in transmissions without config
unused_trans_ids
=
set
(
trans_ids
)
-
set
(
config_ids
)
unused_trans_ids
=
trans_ids
-
config_ids
df_trans
=
df_trans
[
~
df_trans
[
'RADIOLINKID'
].
isin
(
list
(
unused_trans_ids
))]
_log
(
'Removed all links in transmissions where no config is present'
)
# delete link ids in config without transmissions
unused_config_ids
=
set
(
config_ids
)
-
set
(
trans_ids
)
unused_config_ids
=
config_ids
-
trans_ids
df_config
=
df_config
[
~
df_config
[
'LINKID'
].
isin
(
list
(
unused_config_ids
))]
_log
(
'Removed all links in config where no transmission is present'
)
...
...
@@ -155,18 +155,43 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
# delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
# gather duplicated rows in config file
col_subset
=
[
'LINKTYPE'
,
'SITEID_A'
,
'LATITUDE_A'
,
'LONGITUDE_A'
,
'SITEID_B'
,
'LATITUDE_B'
,
'LONGITUDE_B'
,
'CAPACITYINTERFACE'
,
'FREQUENCY'
]
duplicated_config_ids
=
df_config
[
df_config
.
duplicated
(
subset
=
col_subset
)][
'LINKID'
].
unique
().
tolist
()
duplicated_config_ids
=
set
(
df_config
[
df_config
.
duplicated
(
subset
=
col_subset
)][
'LINKID'
].
unique
().
tolist
()
)
# gather duplicated link ids of config file in transmissions file
found_trans_ids
=
df_trans
[
df_trans
[
'RADIOLINKID'
].
isin
(
duplicated_config_ids
)][
'RADIOLINKID'
].
unique
().
tolist
()
found_trans_ids
=
set
(
df_trans
[
df_trans
[
'RADIOLINKID'
].
isin
(
duplicated_config_ids
)][
'RADIOLINKID'
].
unique
().
tolist
()
)
# calculate unused duplicated ids in config file
duplicated_
un
used_ids
=
set
(
duplicated_config_ids
)
-
set
(
found_trans_ids
)
duplicated_used_ids
=
duplicated_config_ids
-
found_trans_ids
# delete rows with unused duplicated link ids in config file
df_config
=
df_config
[
~
df_config
[
'LINKID'
].
isin
(
list
(
duplicated_
un
used_ids
))]
df_config
=
df_config
[
~
df_config
[
'LINKID'
].
isin
(
list
(
duplicated_used_ids
))]
_log
(
'Removed duplicated links which are not in use'
)
# check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B) pairs
# gather unique combinations of link ids and site ids of config
# (new temporary column is much faster than pandas agg function)
df_config
[
'TEMP_LOC_TUPLE'
]
=
df_config
[
'LINKID'
].
astype
(
str
)
+
'--'
+
df_config
[
'SITEID_A'
]
config_loc_tuples
=
set
(
df_config
[
'TEMP_LOC_TUPLE'
].
unique
().
tolist
())
df_config
[
'TEMP_LOC_TUPLE'
]
=
df_config
[
'LINKID'
].
astype
(
str
)
+
'--'
+
df_config
[
'SITEID_B'
]
config_loc_tuples
.
update
(
set
(
df_config
[
'TEMP_LOC_TUPLE'
].
unique
().
tolist
()))
# gather unique combinations of link ids and locations in transmissions
# (new temporary column is much faster than pandas agg function)
df_trans
[
'TEMP_LOC_TUPLE'
]
=
df_trans
[
'RADIOLINKID'
].
astype
(
str
)
+
'--'
+
df_trans
[
'LOCATION'
]
trans_loc_tuples
=
set
(
df_trans
[
'TEMP_LOC_TUPLE'
].
unique
().
tolist
())
# calculate link id - location tuples which are not in config
invalid_loc_tuples
=
trans_loc_tuples
-
config_loc_tuples
# remove invalid tuples from transmissions
df_trans
=
df_trans
[
~
df_trans
[
'TEMP_LOC_TUPLE'
].
isin
(
list
(
invalid_loc_tuples
))]
_log
(
'Removed all transmissions with invalid RADIOLINKID - LOCATION tuples (not present in config)'
)
# remove temp columns
df_config
=
df_config
.
drop
([
'TEMP_LOC_TUPLE'
],
axis
=
1
)
df_trans
=
df_trans
.
drop
([
'TEMP_LOC_TUPLE'
],
axis
=
1
)
# calculate LENGTH in km between links
df_config
[
'LENGTH'
]
=
get_distance
(
df_config
[
'LATITUDE_A'
],
df_config
[
'LONGITUDE_A'
],
df_config
[
'LATITUDE_B'
],
df_config
[
'LONGITUDE_B'
])
...
...
@@ -177,6 +202,15 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
df_config
[
'FREQUENCY'
]
=
df_config
[
'FREQUENCY'
].
map
(
lambda
x
:
str
(
x
)[:
-
3
]).
astype
(
'float'
)
_log
(
'Converted FREQUENCY to float'
)
# convert RXFREQUENCY and TXFREQUENCY to float and from MHz to GHz
# check if columns exist (only present with 2021-05)
if
'RXFREQUENCY'
in
df_config
.
columns
and
'TXFREQUENCY'
in
df_config
.
columns
:
df_config
[
'RXFREQUENCY'
]
=
df_config
[
'RXFREQUENCY'
].
astype
(
'float'
)
df_config
[
'RXFREQUENCY'
]
=
df_config
[
'RXFREQUENCY'
]
/
1000
df_config
[
'TXFREQUENCY'
]
=
df_config
[
'TXFREQUENCY'
].
astype
(
'float'
)
df_config
[
'TXFREQUENCY'
]
=
df_config
[
'TXFREQUENCY'
]
/
1000
_log
(
'Converted RXFREQUENCY and TXFREQUENCY to float and GHz'
)
# TODO: drop transmissions with (operational) status unequal 1?
# check occurrences with: df_trans[(df_trans['STATUS'] != 1) | (df_trans['OPERATIONALSTATUS'] != 1)].shape
...
...
@@ -185,14 +219,6 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
_log
(
'Removed transmissions with STATUS and/or OPERATIONALSTATUS unequal 1'
)
# TODO: check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B)
# TODO: check for each BEGINTIME if for each RADIOLINKID two rows with different LOCATIONS are present
# TODO: check if (RADIOLINKID - LOCATION_A - LOCATION_B) triple equals (LINKID - SITEID_A - SITEID_B) triple
# should we add checks (a preparation pipeline) regarding site and location data?
# SYSTEMNAME, IFNAME and BEGINTIME are identifiers for a unique link to a given timestamp (passed data quality assessment)
# --> see Q&A field descriptions Oliver Eigner (05 Feb 2021)
_log
(
'
\n
******************************** BUILD LINK DF *****************************'
)
# copy transmissions dataframe to link dataframe
...
...
@@ -246,21 +272,29 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
# rename and reorder columns to aid RAINLINK format
name_cols
=
{
'
FREQUENCY'
:
'Frequency
'
,
'
LINKID'
:
'ID
'
,
'BEGINTIME'
:
'DateTime'
,
'PMIN'
:
'Pmin'
,
'PMAX'
:
'Pmax'
,
'REMOTERXLEVEL'
:
'Pmean'
,
'TXLEVEL'
:
'TxLevel'
,
'REMOTERXLEVEL'
:
'RemoteRxLevel'
,
'MEANLINKDIFFLEVEL'
:
'MeanLinkDiffLevel'
,
'DIFFLEVEL'
:
'DiffLevel'
,
'LENGTH'
:
'PathLength'
,
'LONGITUDE_A'
:
'XStart'
,
'LATITUDE_A'
:
'YStart'
,
'LONGITUDE_B'
:
'XEnd'
,
'LATITUDE_B'
:
'YEnd'
,
'LINKID'
:
'ID'
'LENGTH'
:
'PathLength'
,
'FREQUENCY'
:
'Frequency'
,
}
# check if RXFREQUENCY and TXFREQUENCY exist (only present with 2021-05)
if
'RXFREQUENCY'
in
df_link
.
columns
and
'TXFREQUENCY'
in
df_link
.
columns
:
name_cols
.
update
({
'RXFREQUENCY'
:
'RxFrequency'
,
'TXFREQUENCY'
:
'TxFrequency'
})
df_link
=
df_link
.
rename
(
columns
=
name_cols
).
reindex
(
columns
=
list
(
name_cols
.
values
()))
_log
(
'Converted link dataframe to RAINLINK format'
)
...
...
@@ -272,7 +306,7 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
dest_trans
=
file_trans
.
with_name
(
f
'
{
file_trans
.
stem
}
_clean
{
file_trans
.
suffix
}
'
)
# build path for clean link destination file (same folder, date and extension as transmissions file)
date
=
str
(
file_trans
.
stem
)[
-
1
0
:
]
date
=
str
(
file_trans
.
stem
)
.
split
(
'_'
)
[
-
1
]
dest_link
=
pathlib
.
Path
(
dest_trans
.
parents
[
0
],
f
'LINK_
{
date
}
_clean
{
file_trans
.
suffix
}
'
)
...
...
@@ -286,17 +320,28 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
if
__name__
==
'__main__'
:
# flag for data prep
start_prep
=
True
# get config and transmissions file from arguments
args
=
parse_arguments
()
try
:
# convert config and transmissions arguments to paths
file_config
=
pathlib
.
Path
(
args
.
config
)
file_trans
=
pathlib
.
Path
(
args
.
transmissions
)
except
:
# print error and exit with code 2 (command line syntax error) if paths are invalid
print
(
'Invalid path for config and/or transmissions file!'
)
# convert config and transmissions arguments to paths
file_config
=
pathlib
.
Path
(
args
.
config
)
file_trans
=
pathlib
.
Path
(
args
.
transmissions
)
# check if config file exists
if
not
file_config
.
exists
():
_log
(
'Invalid path for config file!'
)
start_prep
=
False
# check if transmissions file exists
if
not
file_trans
.
exists
():
_log
(
'% Invalid path for transmissions file!'
)
start_prep
=
False
# start prep if flag is True, otherwise exit with code 2
if
start_prep
:
prep
(
file_config
,
file_trans
)
else
:
sys
.
exit
(
2
)
# start data preparation
prep
(
file_config
,
file_trans
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment