Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
oeigner
LINK - Forschungsprojekt Repo
Commits
4984ea71
Commit
4984ea71
authored
Jun 15, 2021
by
Fabian Kovac
Browse files
[b] moved updated prep.py to fhstp folder
parent
da15bf9c
Changes
2
Hide whitespace changes
Inline
Side-by-side
FHSTP/prep.py
View file @
4984ea71
...
...
@@ -142,15 +142,15 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
_log
(
'Removed all links in config where no transmission is present'
)
# delete duplicates in config (same values, different link ids), where corresponding link id
'
s are not used in transmissions
# delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
# gather duplicated rows in config file
col_subset
=
[
'LINKTYPE'
,
'SITEID_A'
,
'LATITUDE_A'
,
'LONGITUDE_A'
,
'SITEID_B'
,
'LATITUDE_B'
,
'LONGITUDE_B'
,
'CAPACITYINTERFACE'
,
'FREQUENCY'
]
duplicated_config_ids
=
df_config
[
df_config
.
duplicated
(
subset
=
col_subset
)][
'LINKID'
].
unique
().
tolist
()
# gather duplicated link id
'
s of config file in transmissions file
# gather duplicated link ids of config file in transmissions file
found_trans_ids
=
df_trans
[
df_trans
[
'RADIOLINKID'
].
isin
(
duplicated_config_ids
)][
'RADIOLINKID'
].
unique
().
tolist
()
# calculate unused duplicated id
'
s in config file
# calculate unused duplicated ids in config file
duplicated_unused_ids
=
set
(
duplicated_config_ids
)
-
set
(
found_trans_ids
)
# delete rows with unused duplicated link ids in config file
...
...
@@ -178,6 +178,9 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
# TODO: check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B)
# TODO: check foreach BEGINTIME if foreach RADIOLINKID two rows with different LOCATIONS are present
# TODO: check if (RADIOLINKID - LOCATION_A - LOCATION_B) triple equals (LINKID - SITEID_A - SITEID_B) triple
# should we add checks (a preparation pipeline) regarding site and location data?
# SYSTEMNAME, IFNAME and BEGINTIME are identifiers for a unique link to a given timestamp (passed data quality assessment)
# --> see Q&A field descriptions Oliver Eigner (05 Feb 2021)
_log
(
'
\n
******************************** BUILD LINK DF *****************************'
)
...
...
@@ -187,7 +190,7 @@ def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
_log
(
'Copy transmissions dataframe to link dataframe'
)
# convert begintime to utc
and remove seconds
# convert begintime to utc
df_link
[
'BEGINTIME'
]
=
pd
.
to_datetime
(
df_link
[
'BEGINTIME'
],
format
=
'%Y-%m-%d %H:%M:%S'
)
df_link
[
'BEGINTIME'
]
=
df_link
[
'BEGINTIME'
].
dt
.
tz_localize
(
'Europe/Vienna'
).
dt
.
tz_convert
(
'UTC'
).
dt
.
tz_localize
(
None
)
_log
(
'Converted BEGINTIME to UTC'
)
...
...
prep.py
deleted
100644 → 0
View file @
da15bf9c
#
# Title: Data Preparation for LINK Configs and Transmissions
# Author: Fabian Kovac <ds191008@fhstp.ac.at>
# Team: University of Applied Sciences St. Pölten
# Version: 1.0
# Last changed: 2021-06-15
#
import
sys
import
pathlib
import
argparse
import
numpy
as
np
import
pandas
as
pd
def parse_arguments() -> argparse.Namespace:
    """Parses provided commandline arguments for LINK config file and transmissions

    Both options are declared as required, so argparse itself reports a
    missing option and exits before this function returns.

    Returns:
        args (argparse.Namespace): object with paths to provided config- and transmissions-file
            (attributes `config` and `transmissions`, both parsed as str)
    """

    # create argument parser with description
    # (RawTextHelpFormatter keeps the manual line breaks of the description)
    desc = '# Data Preparation for LINK Configs and Transmissions\n'
    desc += '-' * 64 + '\n'
    desc += 'Script outputs the same files with a "_clean" suffix.\n'
    desc += 'Existing clean versions are automatically overwritten!'

    parser = argparse.ArgumentParser(
        prog='prep.py',
        usage='python %(prog)s -c <config_file> -t <transmissions_file>',
        description=desc,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # add required argument group
    required_args = parser.add_argument_group('required arguments')

    # add config parameter to parser
    required_args.add_argument('-c', '--config', type=str, required=True, help='Path to Config-File')

    # add transmissions parameter to parser
    # (help text previously repeated the config help text)
    required_args.add_argument('-t', '--transmissions', type=str, required=True, help='Path to Transmissions-File')

    # parse arguments
    args = parser.parse_args()

    return args
def _log(msg: str) -> None:
    """Prints a message to the console, prefixed with a marker

    A newline at the very start of the message is emitted in front of the
    marker instead of behind it.

    Parameters:
        msg (str): Message to log to console
    """

    # marker that is put in front of every log message
    marker = '%'

    # split off a leading newline so it ends up before the marker
    if msg.startswith('\n'):
        prefix, text = '\n', msg[1:]
    else:
        prefix, text = '', msg

    # print message
    print(f'{prefix}{marker} {text}')
def get_distance(lat_a: pd.Series, lon_a: pd.Series, lat_b: pd.Series, lon_b: pd.Series, radius_km: float = 6371) -> pd.Series:
    """Calculates distance between two coordinates in km using the haversine function

    All computations are element-wise numpy operations, so the inputs are
    processed as whole vectors without a Python loop.

    Parameters:
        lat_a (pd.Series): Latitudes of point A (degrees)
        lon_a (pd.Series): Longitudes of point A (degrees)
        lat_b (pd.Series): Latitudes of point B (degrees)
        lon_b (pd.Series): Longitudes of point B (degrees)
        radius_km (float): Sphere radius in km the central angle is scaled with (default: 6371)

    Returns:
        km (pd.Series): Vector with distances in km (can directly be assigned a pandas column)
    """

    # convert latitudes and longitudes to radians
    lat_a, lon_a = np.radians(lat_a), np.radians(lon_a)
    lat_b, lon_b = np.radians(lat_b), np.radians(lon_b)

    # calculate differences of latitudes and longitudes
    diff_lat = lat_a - lat_b
    diff_lon = lon_a - lon_b

    # haversine term of the central angle between point a and point b
    a = np.sin(diff_lat / 2) ** 2 + np.cos(lat_a) * np.cos(lat_b) * np.sin(diff_lon / 2) ** 2

    # central angle in radians
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # scale central angle with the sphere radius to get the distance in km
    km = radius_km * c

    return km
def prep(file_config: pathlib.Path, file_trans: pathlib.Path) -> None:
    """Data preparation for LINK config and transmissions

    Reads both semicolon-separated files, filters config and transmissions
    against each other, builds a link dataframe (15min windows, RAINLINK
    column names) and writes three "_clean" files: config and transmissions
    next to their input files, the link file into the transmissions folder.

    Parameters:
        file_config (pathlib.Path): Config File
        file_trans (pathlib.Path): Transmissions File
    """

    _log('\n******************************** READ FILES ********************************')

    # read files
    df_config = pd.read_csv(file_config, sep=';')
    _log(f'Read config file with shape {df_config.shape}')

    df_trans = pd.read_csv(file_trans, sep=';')
    _log(f'Read transmissions file with shape {df_trans.shape}')

    _log('\n******************************** BASIC PREP ********************************')

    # remove test-link with link id 1
    df_config = df_config[df_config['LINKID'] != 1]
    df_trans = df_trans[df_trans['RADIOLINKID'] != 1]
    _log('Removed all entries of test-link with linkid 1')

    # drop links that are officially not in use ('na' in CAPACITYINTERFACE and/or FREQUENCY)
    # --> see Q&A Phillip Scheffknecht (05 Feb 2021)
    df_config = df_config.dropna(axis=0, subset=['CAPACITYINTERFACE', 'FREQUENCY'])
    _log('Dropped configs with NA in CAPACITYINTERFACE and/or FREQUENCY (links officially not in use)')

    # delete rows with unused link ids
    # get link ids of config and transmissions
    config_ids = df_config['LINKID'].unique().tolist()
    trans_ids = df_trans['RADIOLINKID'].unique().tolist()

    # delete link ids in transmissions without config
    unused_trans_ids = set(trans_ids) - set(config_ids)
    df_trans = df_trans[~df_trans['RADIOLINKID'].isin(list(unused_trans_ids))]
    _log('Removed all links in transmissions where no config is present')

    # delete link ids in config without transmissions
    unused_config_ids = set(config_ids) - set(trans_ids)
    df_config = df_config[~df_config['LINKID'].isin(list(unused_config_ids))]
    _log('Removed all links in config where no transmission is present')

    # delete duplicates in config (same values, different link ids), where corresponding link ids are not used in transmissions
    # gather duplicated rows in config file
    col_subset = ['LINKTYPE', 'SITEID_A', 'LATITUDE_A', 'LONGITUDE_A', 'SITEID_B', 'LATITUDE_B', 'LONGITUDE_B', 'CAPACITYINTERFACE', 'FREQUENCY']
    duplicated_config_ids = df_config[df_config.duplicated(subset=col_subset)]['LINKID'].unique().tolist()

    # gather duplicated link ids of config file in transmissions file
    found_trans_ids = df_trans[df_trans['RADIOLINKID'].isin(duplicated_config_ids)]['RADIOLINKID'].unique().tolist()

    # calculate unused duplicated ids in config file
    duplicated_unused_ids = set(duplicated_config_ids) - set(found_trans_ids)

    # delete rows with unused duplicated link ids in config file
    # (explicit copy: the column assignments below must work on an independent
    # frame, otherwise pandas emits a SettingWithCopyWarning for the filtered slice)
    df_config = df_config[~df_config['LINKID'].isin(list(duplicated_unused_ids))].copy()
    _log('Removed duplicated links which are not in use')

    # calculate LENGTH in km between links using the haversine function
    df_config['LENGTH'] = get_distance(df_config['LATITUDE_A'], df_config['LONGITUDE_A'], df_config['LATITUDE_B'], df_config['LONGITUDE_B'])
    _log('Calculate distance between sites using the haversine function')

    # convert FREQUENCY to float
    # NOTE(review): drops the last three characters of every value before the cast
    # (presumably a unit suffix) - confirm against the raw config data
    df_config['FREQUENCY'] = df_config['FREQUENCY'].map(lambda x: str(x)[:-3]).astype('float')
    _log('Converted FREQUENCY to float')

    # TODO: drop transmissions with (operational) status unequal 1?
    # check occurrences with: df_trans[(df_trans['STATUS'] != 1) | (df_trans['OPERATIONALSTATUS'] != 1)].shape
    df_trans = df_trans[df_trans['STATUS'] == 1]
    df_trans = df_trans[df_trans['OPERATIONALSTATUS'] == 1]
    _log('Removed transmissions with STATUS and/or OPERATIONALSTATUS unequal 1')

    # TODO: check (RADIOLINKID - LOCATION) pairs with (LINKID - SITEID_A) or (LINKID - SITEID_B)
    # TODO: check foreach BEGINTIME if foreach RADIOLINKID two rows with different LOCATIONS are present
    # TODO: check if (RADIOLINKID - LOCATION_A - LOCATION_B) triple equals (LINKID - SITEID_A - SITEID_B) triple

    # should we add checks (a preparation pipeline) regarding site and location data?
    # SYSTEMNAME, IFNAME and BEGINTIME are identifiers for a unique link to a given timestamp (passed data quality assessment)
    # --> see Q&A field descriptions Oliver Eigner (05 Feb 2021)

    _log('\n******************************** BUILD LINK DF *****************************')

    # copy transmissions dataframe to link dataframe
    df_link = df_trans.copy()
    _log('Copy transmissions dataframe to link dataframe')

    # convert begintime to utc
    # (interpret the naive timestamps as Europe/Vienna, convert to UTC, drop the tz info again)
    # NOTE(review): tz_localize is called without ambiguous/nonexistent arguments -
    # confirm how timestamps around daylight-saving changes should be handled
    df_link['BEGINTIME'] = pd.to_datetime(df_link['BEGINTIME'], format='%Y-%m-%d %H:%M:%S')
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.tz_localize('Europe/Vienna').dt.tz_convert('UTC').dt.tz_localize(None)
    _log('Converted BEGINTIME to UTC')

    # copy REMOTERXLEVEL to PMIN and PMAX (for aggregation in 15min window conversion)
    df_link['PMIN'] = df_link['REMOTERXLEVEL']
    df_link['PMAX'] = df_link['REMOTERXLEVEL']
    _log('Created PMIN and PMAX of REMOTERXLEVEL')

    # convert 3min windows to 15min windows
    # (group by BEGINTIME floored to 15 minutes and by link id)
    group_cols = [df_link['BEGINTIME'].dt.floor('15Min'), 'RADIOLINKID']
    agg_cols = {'TXLEVEL': 'mean', 'REMOTERXLEVEL': 'mean', 'PMIN': 'min', 'PMAX': 'max'}
    df_link = df_link.groupby(group_cols).agg(agg_cols).reset_index()
    _log('Converted 3min windows to 15min windows')

    # convert BEGINTIME to RAINLINK format
    df_link['BEGINTIME'] = df_link['BEGINTIME'].dt.strftime('%Y%m%d%H%M')
    _log('Converted BEGINTIME to RAINLINK format "%Y%m%d%H%M"')

    # build df with differences of sending and receiving levels
    df_diff = df_link[['RADIOLINKID', 'TXLEVEL', 'REMOTERXLEVEL']].copy()
    df_diff['MEANLINKDIFFLEVEL'] = df_diff['TXLEVEL'] - df_diff['REMOTERXLEVEL']
    _log('Built dataframe with mean link difference levels of TXLEVEL and REMOTERXLEVEL')

    # get mean of differences (one row per link id)
    df_diff = df_diff.groupby(['RADIOLINKID']).agg({'MEANLINKDIFFLEVEL': 'mean'}).reset_index()

    # merge differences to transmission dataframe
    # (the log call now follows the merge it reports; the order of the printed messages is unchanged)
    df_link = pd.merge(df_link, df_diff, how='inner', left_on='RADIOLINKID', right_on='RADIOLINKID')
    _log('Merged mean link difference levels back to link dataframe')

    # deviation of each window from the mean difference of its link
    df_link['DIFFLEVEL'] = df_link['TXLEVEL'] - df_link['REMOTERXLEVEL'] - df_link['MEANLINKDIFFLEVEL']
    _log('Calculated DIFFLEVEL as TXLEVEL - REMOTERXLEVEL - MEANLINKDIFFLEVEL')

    # merge config and link dataframe
    drop_cols = ['RADIOLINKID', 'LINKTYPE', 'SITEID_A', 'SITEID_B', 'CAPACITYINTERFACE']
    df_link = pd.merge(df_link, df_config, how='inner', left_on='RADIOLINKID', right_on='LINKID').drop(drop_cols, axis=1)
    _log('Merged config data to link dataframe')

    # rename and reorder columns to aid RAINLINK format
    # (the order of this mapping is also the column order of the output file)
    name_cols = {
        'FREQUENCY': 'Frequency',
        'BEGINTIME': 'DateTime',
        'PMIN': 'Pmin',
        'PMAX': 'Pmax',
        'TXLEVEL': 'TxLevel',
        'REMOTERXLEVEL': 'RemoteRxLevel',
        'MEANLINKDIFFLEVEL': 'MeanLinkDiffLevel',
        'DIFFLEVEL': 'DiffLevel',
        'LENGTH': 'PathLength',
        'LONGITUDE_A': 'XStart',
        'LATITUDE_A': 'YStart',
        'LONGITUDE_B': 'XEnd',
        'LATITUDE_B': 'YEnd',
        'LINKID': 'ID',
    }
    df_link = df_link.rename(columns=name_cols).reindex(columns=list(name_cols.values()))
    _log('Converted link dataframe to RAINLINK format')

    _log('\n******************************** SAVE FILES ********************************')

    # build path for clean config and transmissions destination files
    dest_config = file_config.with_name(f'{file_config.stem}_clean{file_config.suffix}')
    dest_trans = file_trans.with_name(f'{file_trans.stem}_clean{file_trans.suffix}')

    # build path for clean link destination file (same folder, date and extension as transmissions file)
    # (the date is taken as the last 10 characters of the transmissions file stem)
    date = str(file_trans.stem)[-10:]
    dest_link = pathlib.Path(dest_trans.parents[0], f'LINK_{date}_clean{file_trans.suffix}')

    # save cleaned files
    df_config.to_csv(dest_config, sep=';', header=True, index=False)
    _log(f'Saved clean config file with shape {df_config.shape} to "{str(dest_config)}"')

    df_trans.to_csv(dest_trans, sep=';', header=True, index=False)
    _log(f'Saved clean transmissions file with shape {df_trans.shape} to "{str(dest_trans)}"')

    df_link.to_csv(dest_link, sep=';', header=True, index=False)
    _log(f'Saved clean link file with shape {df_link.shape} to "{str(dest_link)}"')
if __name__ == '__main__':
    # get config and transmissions file from arguments
    args = parse_arguments()

    # convert config and transmissions arguments to paths
    file_config = pathlib.Path(args.config)
    file_trans = pathlib.Path(args.transmissions)

    # print error and exit with code 2 (command line syntax error) if paths are invalid
    # (building a Path object does not touch the file system, so the files are
    # checked explicitly instead of relying on an exception from pathlib.Path)
    if not (file_config.is_file() and file_trans.is_file()):
        print('Invalid path for config and/or transmissions file!')
        sys.exit(2)

    # start data preparation
    prep(file_config, file_trans)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment