"""
===============================================================================
Module featuring classes and functionality for unifying events collections.
Classes
-------
EventsUnion
Dependencies
------------
pandas, numpy, rangel, copy, warnings, functools
Development
-----------
Developed by:
Tariq Shihadah, tariq.shihadah@gmail.com
Created:
4/13/2022
Modified:
4/13/2022
===============================================================================
"""
################
# DEPENDENCIES #
################
import pandas as pd
import numpy as np
import copy, warnings
from functools import wraps
from rangel import RangeCollection
class EventsUnion(object):
    """
    Manager for unifying multiple events collections into a single
    collection built on the least common intervals of all inputs.

    Parameters
    ----------
    objs : list-like of EventsCollection instances
        A selection of EventsCollection object instances to be combined
        into a single instance based on the input parameters.
    **kwargs
        Accepted for interface compatibility. NOTE: these keyword arguments
        are currently not stored or forwarded anywhere by this class.
    """

    def __init__(self, objs, **kwargs):
        self.objs = objs

    @property
    def objs(self):
        """The events collections being unified."""
        return self._objs

    @objs.setter
    def objs(self, objs):
        # Validate input object types. check_compatibility is imported at
        # the bottom of the module (late dependency).
        if not check_compatibility(objs):
            # Fail loudly here instead of leaving self._objs unset, which
            # would only surface later as an unrelated AttributeError.
            raise ValueError(
                "Input objs must be compatible EventsCollection instances.")
        self._objs = objs

    @property
    def num_objs(self):
        """Number of events collections being unified."""
        return len(self.objs)

    @property
    def group_keys_unique(self):
        """
        List of unique group keys across all collections, in order of first
        appearance so that results are reproducible between runs.
        """
        return list(dict.fromkeys(
            key for obj in self.objs for key in obj.group_keys_unique))

    @property
    def num_keys(self):
        """Number of key columns, taken from the first collection."""
        return self.objs[0].num_keys

    def get_groups(self, keys, empty=True):
        """
        Retrieve unique groups of events from each related collection based on
        provided key values.

        Parameters
        ----------
        keys : key value, tuple of key values, or list of the same
            If only one key column is defined within the collections, a single
            column value may be provided. Otherwise, a tuple of column values
            must be provided in the same order as they appear in self.keys. To
            get multiple groups, a list of key values or tuples may be
            provided.
        empty : bool, default True
            Whether to allow for empty events groups to be returned when the
            provided keys are valid but are not associated with any actual
            events. If False, these cases will return a KeyError.

        Returns
        -------
        groups : list
            One result of ``obj.get_group(keys, empty=empty)`` per collection,
            in the same order as self.objs.
        """
        # Retrieve groups from all collections
        return [obj.get_group(keys, empty=empty) for obj in self._objs]

    @staticmethod
    def _labels_from_positions(groups, index):
        """
        Convert the positional index arrays returned by a range union (one
        array per group, with -1 marking "no originating record") into an
        array of dataframe index labels, one column per group, with NaN
        wherever no originating record exists.
        """
        arrs = []
        for group, positions in zip(groups, index):
            positions = np.asarray(positions)
            present = positions != -1
            try:
                labels = group.df.index.values[positions]
            except IndexError:
                # Positions cannot be resolved against this group's index
                # (e.g., the group is empty); keep the raw positions and
                # only null-out the missing markers.
                labels = positions
            arrs.append(np.where(present, labels, np.nan))
        return np.array(arrs).T

    def union(
        self,
        fill_gaps=False,
        get_index=True,
        merge=False,
        suffixes=None,
        **kwargs
    ):
        """
        Combine multiple EventsCollection instances into a single instance,
        creating least common intervals among all collections and maintaining
        all event attributes. The resulting combined events will be used to
        create and return an EventsCollection modeled after the first indexed
        collection in self.objs.

        Parameters
        ----------
        fill_gaps : bool, default False
            Whether to fill gaps in the merged collection with empty events.
            These events would not be associated with any parent collection and
            would not be populated with any events attributes.
        get_index : bool, default True
            Whether to produce columns relating each new record to the index of
            the originating record in the input events dataframes. When this is
            not necessary, setting to False may produce significant time
            savings.
        merge : bool, default False
            Whether to merge columns from each original dataframe to the newly
            created resegmented events collection dataframe. If not done during
            the union, it can be done later by merging on the new 'index_i'
            columns which correlate with the indices of the original
            dataframes. To perform this merge manually, the get_index parameter
            should be True.
        suffixes : list-like, default ['_0', ..., '_n']
            Sequence of length equal to the number of events collections being
            unified, where each element is a string indicating the suffix to
            add to overlapping column names in each corresponding events
            dataframe. All entries must be unique.
        **kwargs
            Additional keyword arguments passed to each ``DataFrame.merge``
            call when merge=True.

        Returns
        -------
        ec
            The result of ``self.objs[0].from_similar(data, geom=None)`` where
            data is the unified events dataframe.

        Raises
        ------
        ValueError
            If suffixes is not a list-like of unique strings with one entry
            per collection.
        """
        # Validate suffixes with explicit checks (assert statements are
        # stripped when running with -O and so cannot guard user input)
        if suffixes is None:
            suffixes = [f'_{i}' for i in range(self.num_objs)]
        else:
            try:
                valid = (
                    len(suffixes) == self.num_objs
                    and len(set(suffixes)) == self.num_objs
                    and all(isinstance(suffix, str) for suffix in suffixes)
                )
            except TypeError:
                # Unsized, non-iterable or unhashable input
                valid = False
            if not valid:
                raise ValueError(
                    "Input suffixes must be list-like of unique strings with "
                    "a length equal to the number of events collections being "
                    f"unified ({self.num_objs:,.0f}).")

        # The merge relies on the 'index_i' columns; make the no-op visible
        if merge and not get_index:
            warnings.warn(
                "merge=True requires get_index=True; no columns will be "
                "merged.", stacklevel=2)

        # The first collection serves as the model for the output
        first = self.objs[0]

        # Initialize new linear referencing data columns
        keys = []
        begs = []
        ends = []
        indices = []

        # Iterate over all unique group keys across all collections
        # For collections that do not contain a given group key, the resulting
        # data will be left as null
        for group_key in self.group_keys_unique:
            # Get each group associated with the selected key across all
            # collections being analyzed
            groups = self.get_groups(group_key, empty=True)
            # Retrieve the range data associated with each group being unified
            ranges = [group.rng for group in groups]

            # Union ranges
            if get_index:
                rc, index = RangeCollection.union(
                    ranges, fill_gaps=fill_gaps, return_index=True,
                    null_index=-1)
                # Translate positional indices to dataframe index labels
                indices.append(self._labels_from_positions(groups, index))
            else:
                rc = RangeCollection.union(
                    ranges, fill_gaps=fill_gaps, return_index=False,
                    null_index=-1)

            # Log unified range results
            keys.append(np.tile(group_key, (rc.num_ranges, 1)))
            begs.append(rc.begs)
            ends.append(rc.ends)

        # Prepare resulting unified dataframe
        keys = np.concatenate(keys, axis=0)
        begs = np.concatenate(begs)
        ends = np.concatenate(ends)
        columns = {
            **dict(zip(first.keys, keys.T)),
            first.beg: begs,
            first.end: ends,
        }
        if get_index:
            # Only concatenate when index arrays were actually collected;
            # np.concatenate raises on an empty list
            indices = np.concatenate(indices, axis=0)
            columns.update(
                {f'index_{i}': arr for i, arr in enumerate(indices.T)})
        data = pd.DataFrame(columns)

        # Merge resegmented data with original dataframe columns
        if merge and get_index:
            for i, obj in enumerate(self.objs):
                data = data.merge(
                    obj.df.drop(columns=first.targets, errors='ignore'),
                    how='left', left_on=f'index_{i}', right_index=True,
                    suffixes=(None, suffixes[i]), **kwargs)

        # Convert to events collection in the model of the first collection
        return first.from_similar(data, geom=None)
#####################
# LATE DEPENDENCIES #
#####################
from linref.events.collection import EventsCollection, check_compatibility