"""
DATA:
-----------------------------
| | cid |
-----------------------------
| | |
|r | |
|i | data |
|d | |
| | |
-----------------------------
ROW METADATA:
--------------------------
|id| rhd |
--------------------------
| | |
|r | |
|i | row_metadata |
|d | |
| | |
--------------------------
COLUMN METADATA:
N.B. The df is transposed from how it looks in a gct file.
---------------------
|id| chd |
---------------------
| | |
| | |
| | |
|c | |
|i | col_metadata |
|d | |
| | |
| | |
| | |
---------------------
N.B. rids, cids, rhds, and chds must be:
- unique
- matching in both content & order everywhere they're found
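
Example of matching component dataframes (a minimal sketch; the ids, headers,
and values are hypothetical, and pandas is assumed to be imported as pd):
data_df = pd.DataFrame([[1., 2.], [3., 4.]], index=["rid1", "rid2"], columns=["cid1", "cid2"])
row_metadata_df = pd.DataFrame({"rhd1": ["a", "b"]}, index=pd.Index(["rid1", "rid2"], name="rid"))
col_metadata_df = pd.DataFrame({"chd1": ["x", "y"]}, index=pd.Index(["cid1", "cid2"], name="cid"))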
"""
import numpy as np
import pandas as pd
import logging
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
__authors__ = 'Oana Enache, Lev Litichevskiy, Dave Lahr'
__email__ = 'dlahr@broadinstitute.org'
class GCToo(object):
"""Class representing parsed gct(x) objects as pandas dataframes.
Contains 3 component dataframes (row_metadata_df, col_metadata_df,
and data_df) as well as an assembly of these 3 into a multi index df
that provides an alternate way of selecting data.
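Example (a minimal sketch; the component dataframes are assumed to follow
the layout described in the module docstring above):
my_gctoo = GCToo(data_df, row_metadata_df=row_metadata_df, col_metadata_df=col_metadata_df, make_multiindex=True)
print(my_gctoo)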
"""
def __init__(self, data_df, row_metadata_df=None, col_metadata_df=None,
src=None, version=None, make_multiindex=False, logger_name=setup_logger.LOGGER_NAME):
self.logger = logging.getLogger(logger_name)
self.src = src
self.version = version
# Check data_df before setting
self.check_df(data_df)
self.data_df = data_df
if row_metadata_df is None:
self.row_metadata_df = pd.DataFrame(index=data_df.index)
else:
# Lots of checks will occur when this attribute is set (see __setattr__ below)
self.row_metadata_df = row_metadata_df
if col_metadata_df is None:
self.col_metadata_df = pd.DataFrame(index=data_df.columns)
else:
# Lots of checks will occur when this attribute is set (see __setattr__ below)
self.col_metadata_df = col_metadata_df
# Create multi_index_df if explicitly requested
if make_multiindex:
self.assemble_multi_index_df()
else:
self.multi_index_df = None
# This GCToo object is now initialized
self._initialized = True
def __setattr__(self, name, value):
# Make sure row/col metadata agree with data_df before setting
if name in ["row_metadata_df", "col_metadata_df"]:
self.check_df(value)
if name == "row_metadata_df":
self.id_match_check(self.data_df, value, "row")
value = value.reindex(self.data_df.index)
super(GCToo, self).__setattr__(name, value)
else:
self.id_match_check(self.data_df, value, "col")
value = value.reindex(self.data_df.columns)
super(GCToo, self).__setattr__(name, value)
# When reassigning data_df after initialization, reindex row/col metadata if necessary
# N.B. Need to check if _initialized is present before checking if it's true, or code will break
elif name == "data_df" and "_initialized" in self.__dict__ and self._initialized:
self.id_match_check(value, self.row_metadata_df, "row")
self.id_match_check(value, self.col_metadata_df, "col")
super(GCToo, self).__setattr__("row_metadata_df", self.row_metadata_df.reindex(value.index))
super(GCToo, self).__setattr__("col_metadata_df", self.col_metadata_df.reindex(value.columns))
super(GCToo, self).__setattr__(name, value)
# Can't reassign multi_index_df after initialization
elif name == "multi_index_df" and "_initialized" in self.__dict__ and self._initialized:
msg = ("Cannot reassign value of multi_index_df attribute; " +
"if you'd like a new multiindex df, please create a new GCToo instance" +
"with appropriate data_df, row_metadata_df, and col_metadata_df fields.")
self.logger.error(msg)
raise Exception("GCToo.__setattr__: " + msg)
# Otherwise, use the normal __setattr__ method
else:
super(GCToo, self).__setattr__(name, value)
def check_df(self, df):
"""
Verifies that df is a pandas DataFrame instance and
that its index and column values are unique.
"""
if isinstance(df, pd.DataFrame):
if not df.index.is_unique:
repeats = df.index[df.index.duplicated()].values
msg = "Index values must be unique but aren't. The following entries appear more than once: {}".format(repeats)
self.logger.error(msg)
raise Exception("GCToo GCToo.check_df " + msg)
if not df.columns.is_unique:
repeats = df.columns[df.columns.duplicated()].values
msg = "Column values must be unique but aren't. The following entries appear more than once: {}".format(repeats)
self.logger.error(msg)
raise Exception("GCToo GCToo.check_df " + msg)
else:
return True
else:
msg = "expected Pandas DataFrame, got something else: {} of type: {}".format(df, type(df))
self.logger.error(msg)
raise Exception("GCToo GCToo.check_df " + msg)
def id_match_check(self, data_df, meta_df, dim):
"""
Verifies that id values match between:
- row case: index of data_df & index of row metadata
- col case: columns of data_df & index of column metadata
"""
if dim == "row":
if len(data_df.index) == len(meta_df.index) and set(data_df.index) == set(meta_df.index):
return True
else:
msg = ("The rids are inconsistent between data_df and row_metadata_df.\n" +
"data_df.index.values:\n{}\nrow_metadata_df.index.values:\n{}").format(data_df.index.values, meta_df.index.values)
self.logger.error(msg)
raise Exception("GCToo GCToo.id_match_check " + msg)
elif dim == "col":
if len(data_df.columns) == len(meta_df.index) and set(data_df.columns) == set(meta_df.index):
return True
else:
msg = ("The cids are inconsistent between data_df and col_metadata_df.\n" +
"data_df.columns.values:\n{}\ncol_metadata_df.index.values:\n{}").format(data_df.columns.values, meta_df.index.values)
self.logger.error(msg)
raise Exception("GCToo GCToo.id_match_check " + msg)
def __str__(self):
"""Prints a string representation of a GCToo object."""
version = "{}\n".format(self.version)
source = "src: {}\n".format(self.src)
data = "data_df: [{} rows x {} columns]\n".format(
self.data_df.shape[0], self.data_df.shape[1])
row_meta = "row_metadata_df: [{} rows x {} columns]\n".format(
self.row_metadata_df.shape[0], self.row_metadata_df.shape[1])
col_meta = "col_metadata_df: [{} rows x {} columns]".format(
self.col_metadata_df.shape[0], self.col_metadata_df.shape[1])
full_string = (version + source + data + row_meta + col_meta)
return full_string
def assemble_multi_index_df(self):
"""Assembles three component dataframes into a multiindex dataframe.
Sets the result to self.multi_index_df.
IMPORTANT: Cross-section ("xs") is the best command for selecting
data. Be sure to use the flag "drop_level=False" with this command,
or else the dataframe that is returned will not have the same
metadata as the input.
N.B. "level" means metadata header.
N.B. "axis=1" indicates column annotations.
Examples:
1) Select the probe with pr_lua_id="LUA-3404":
lua3404_df = multi_index_df.xs("LUA-3404", level="pr_lua_id", drop_level=False)
2) Select all DMSO samples:
DMSO_df = multi_index_df.xs("DMSO", level="pert_iname", axis=1, drop_level=False)
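3) Build the multiindex df on an existing GCToo instance and select one
sample by its cid (a sketch; "my_gctoo" and "cid1" are placeholder names):
my_gctoo.assemble_multi_index_df()
cid1_df = my_gctoo.multi_index_df.xs("cid1", level="cid", axis=1, drop_level=False)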
"""
#prepare row index
self.logger.debug("Row metadata shape: {}".format(self.row_metadata_df.shape))
self.logger.debug("Is empty? {}".format(self.row_metadata_df.empty))
row_copy = pd.DataFrame(self.row_metadata_df.index) if self.row_metadata_df.empty else self.row_metadata_df.copy()
row_copy["rid"] = row_copy.index
row_index = pd.MultiIndex.from_arrays(row_copy.T.values, names=row_copy.columns)
#prepare column index
self.logger.debug("Col metadata shape: {}".format(self.col_metadata_df.shape))
col_copy = pd.DataFrame(self.col_metadata_df.index) if self.col_metadata_df.empty else self.col_metadata_df.copy()
col_copy["cid"] = col_copy.index
transposed_col_metadata = col_copy.T
col_index = pd.MultiIndex.from_arrays(transposed_col_metadata.values, names=transposed_col_metadata.index)
# Create multi index dataframe using the values of data_df and the indexes created above
self.logger.debug("Data df shape: {}".format(self.data_df.shape))
self.multi_index_df = pd.DataFrame(data=self.data_df.values, index=row_index, columns=col_index)
def multi_index_df_to_component_dfs(multi_index_df, rid="rid", cid="cid"):
""" Convert a multi-index df into 3 component dfs. """
# Id level of the multiindex will become the index
rids = list(multi_index_df.index.get_level_values(rid))
cids = list(multi_index_df.columns.get_level_values(cid))
# It's possible that the index and/or columns of multi_index_df are not
# actually a MultiIndex, or (in Python 3) have only a single level; check for both cases
if isinstance(multi_index_df.index, pd.MultiIndex):
# Check whether the index has more than one level (Python 3)
if len(multi_index_df.index.names) > 1:
# If so, drop rid because it won't go into the body of the metadata
mi_df_index = multi_index_df.index.droplevel(rid)
# Names of the multiindex levels become the headers
rhds = list(mi_df_index.names)
# Assemble metadata values
row_metadata = np.array([mi_df_index.get_level_values(level).values for level in list(rhds)]).T
# If the index has only one level (Python 3), rhds and row_metadata should be empty
else:
rhds = []
row_metadata = []
# If the index is not a MultiIndex, rhds and row_metadata should be empty
else:
rhds = []
row_metadata = []
# Check if columns of multi_index_df are in fact multi-index
if isinstance(multi_index_df.columns, pd.MultiIndex):
# Check whether the columns have more than one level (Python 3)
if len(multi_index_df.columns.names) > 1:
# If so, drop cid because it won't go into the body of the metadata
mi_df_columns = multi_index_df.columns.droplevel(cid)
# Names of the multiindex levels become the headers
chds = list(mi_df_columns.names)
# Assemble metadata values
col_metadata = np.array([mi_df_columns.get_level_values(level).values for level in list(chds)]).T
# If the columns have only one level (Python 3), chds and col_metadata should be empty
else:
chds = []
col_metadata = []
# If the columns are not a MultiIndex, chds and col_metadata should be empty
else:
chds = []
col_metadata = []
# Create component dfs
row_metadata_df = pd.DataFrame.from_records(row_metadata, index=pd.Index(rids, name="rid"), columns=pd.Index(rhds, name="rhd"))
col_metadata_df = pd.DataFrame.from_records(col_metadata, index=pd.Index(cids, name="cid"), columns=pd.Index(chds, name="chd"))
data_df = pd.DataFrame(multi_index_df.values, index=pd.Index(rids, name="rid"), columns=pd.Index(cids, name="cid"))
return data_df, row_metadata_df, col_metadata_df