"""
concat.py
This script concatenates gct(x) files together. You can tell it to
find files using the file_wildcard argument, or you can tell it exactly
which files you want to concatenate using the input_filepaths argument. The
heart of this script is the pair of functions hstack (horizontal concatenation
of GCToo objects) and vstack (vertical concatenation).
Terminology: 'Common' metadata refers to the metadata that is shared among
the loaded GCToo objects. For example, when horizontally concatenating, the
'common' metadata is the row metadata. 'Concatenated' metadata is the metadata
for the entries being concatenated together; for example, when horizontally
concatenating, the 'concatenated' metadata is the column metadata because
columns are being concatenated together.
There are 2 arguments that allow you to work around certain obstacles
of concatenation.
1) If the 'common' metadata contains fields that are not the same in
all files, then you will need to remove these fields using the
fields_to_remove argument.
2) If the 'concatenated' metadata ids are not unique across the different
files and you try to concatenate them, an invalid GCToo (one with duplicate
ids) would be formed. To overcome this, use the reset_ids argument. This will
move the original ids to a new metadata field and replace them with unique
integers.
N.B. This script sorts everything!
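Example command-line usage (a sketch; the .gct filenames are hypothetical):

    python concat.py -d horiz -w "plate_*.gct" -ot gctx -o combined.gctx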
"""
import argparse
import os
import sys
import glob
import logging
import numpy
import pandas as pd
import cmapPy.pandasGEXpress.GCToo as GCToo
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
import cmapPy.pandasGEXpress.parse as parse
import cmapPy.pandasGEXpress.write_gct as write_gct
import cmapPy.pandasGEXpress.write_gctx as write_gctx
__author__ = "Lev Litichevskiy"
__email__ = "lev@broadinstitute.org"
logger = logging.getLogger(setup_logger.LOGGER_NAME)
def build_parser():
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Required args
parser.add_argument("--concat_direction", "-d", required=True,
choices=["horiz", "vert"],
help="which direction to concatenate")
    mutually_exclusive_group = parser.add_mutually_exclusive_group(required=True)
mutually_exclusive_group.add_argument("--input_filepaths", "-if", nargs="+",
help="full paths to gct(x) files to be concatenated")
mutually_exclusive_group.add_argument("--file_wildcard", "-w", type=str,
help=("wildcard specifying where files should be found " +
"(make sure to surround in quotes if calling from command line!)"))
parser.add_argument("--out_type", "-ot", default="gctx", choices=["gct", "gctx"],
help="whether to save output as a gct or gctx")
parser.add_argument("--out_name", "-o", type=str, default="concated.gctx",
help="what to name the output file")
parser.add_argument("--fields_to_remove", "-ftr", nargs="+", default=[],
help="fields to remove from the common metadata")
parser.add_argument("--remove_all_metadata_fields", "-ramf", action="store_true", default=False,
help="remove all metadata fields during operation")
parser.add_argument("--reset_ids", "-rsi", action="store_true", default=False,
help="whether to reset ids (use this flag if ids are not unique)")
parser.add_argument("-data_null", type=str, default="NaN",
help="how to represent missing values in the data")
parser.add_argument("-metadata_null", type=str, default="-666",
help="how to represent missing values in the metadata")
parser.add_argument("-filler_null", type=str, default="-666",
help="what value to use for filling the top-left filler block if output is a .gct")
parser.add_argument("-verbose", "-v", action="store_true", default=False,
help="whether to print a bunch of output")
parser.add_argument("--error_report_output_file", "-erof", type=str, default=None,
help="""destination file for writing out error report - currently information about inconsistent
metadata fields in the common dimension of the concat operation""")
return parser
def main():
# get args
args = build_parser().parse_args(sys.argv[1:])
setup_logger.setup(verbose=args.verbose)
logger.debug("args: {}".format(args))
concat_main(args)
def concat_main(args):
""" Separate method from main() in order to make testing easier and to
enable command-line access. """
# Get files directly
if args.input_filepaths is not None:
files = args.input_filepaths
# Or find them
else:
files = get_file_list(args.file_wildcard)
# No files found
if len(files) == 0:
msg = "No files were found. args.file_wildcard: {}".format(args.file_wildcard)
logger.error(msg)
raise Exception(msg)
# Only 1 file found
if len(files) == 1:
logger.warning("Only 1 file found. No concatenation needs to be done, exiting")
return
# More than 1 file found
else:
# Parse each file and append to a list
gctoos = []
for f in files:
gctoos.append(parse.parse(f))
# Create concatenated gctoo object
if args.concat_direction == "horiz":
out_gctoo = hstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
args.fields_to_remove, args.reset_ids)
elif args.concat_direction == "vert":
out_gctoo = vstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
args.fields_to_remove, args.reset_ids)
# Write out_gctoo to file
logger.info("Writing to output file args.out_name: {}".format(args.out_name))
if args.out_type == "gctx":
write_gctx.write(out_gctoo, args.out_name)
elif args.out_type == "gct":
write_gct.write(out_gctoo, args.out_name,
filler_null=args.filler_null,
metadata_null=args.metadata_null,
data_null=args.data_null)
def get_file_list(wildcard):
""" Search for files to be concatenated. Currently very basic, but could
expand to be more sophisticated.
Args:
        wildcard (glob pattern string)
Returns:
files (list of full file paths)
"""
files = glob.glob(os.path.expanduser(wildcard))
return files
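# Example (a sketch; the path is hypothetical):
#
#   get_file_list("~/gct_files/*.gct")  # -> e.g. ["/home/user/gct_files/a.gct", ...]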
def hstack(gctoos, remove_all_metadata_fields=False, error_report_file=None, fields_to_remove=[], reset_ids=False):
""" Horizontally concatenate gctoos.
Args:
gctoos (list of gctoo objects)
remove_all_metadata_fields (bool): ignore/strip all common metadata when combining gctoos
error_report_file (string): path to write file containing error report indicating
problems that occurred during hstack, mainly for inconsistencies in common metadata
fields_to_remove (list of strings): fields to be removed from the
common metadata because they don't agree across files
reset_ids (bool): set to True if sample ids are not unique
Return:
concated (gctoo object)
"""
# Separate each gctoo into its component dfs
row_meta_dfs = []
col_meta_dfs = []
data_dfs = []
srcs = []
for g in gctoos:
row_meta_dfs.append(g.row_metadata_df)
col_meta_dfs.append(g.col_metadata_df)
data_dfs.append(g.data_df)
srcs.append(g.src)
logger.debug("shapes of row_meta_dfs: {}".format([x.shape for x in row_meta_dfs]))
# Concatenate row metadata
all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
# Concatenate col metadata
all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs, remove_all_metadata_fields)
# Concatenate the data_dfs
all_data_df = assemble_data(data_dfs, "horiz")
# Make sure df shapes are correct
assert all_data_df.shape[0] == all_row_metadata_df.shape[0], "Number of rows in metadata does not match number of rows in data - all_data_df.shape[0]: {} all_row_metadata_df.shape[0]: {}".format(all_data_df.shape[0], all_row_metadata_df.shape[0])
    assert all_data_df.shape[1] == all_col_metadata_df.shape[0], "Number of columns in data does not match number of rows in column metadata - all_data_df.shape[1]: {} all_col_metadata_df.shape[0]: {}".format(all_data_df.shape[1], all_col_metadata_df.shape[0])
# If requested, reset sample ids to be unique integers and move old sample
# ids into column metadata
if reset_ids:
do_reset_ids(all_col_metadata_df, all_data_df, "horiz")
logger.info("Build GCToo of all...")
concated = GCToo.GCToo(row_metadata_df=all_row_metadata_df,
col_metadata_df=all_col_metadata_df,
data_df=all_data_df)
return concated
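# Example programmatic use of hstack (a sketch; "plate1.gct" and "plate2.gct"
# are hypothetical files that share row ids, and "provenance_code" is a
# hypothetical common-metadata field that disagrees between them):
#
#   g1 = parse.parse("plate1.gct")
#   g2 = parse.parse("plate2.gct")
#   combined = hstack([g1, g2], fields_to_remove=["provenance_code"])
#   assert combined.data_df.shape[1] == g1.data_df.shape[1] + g2.data_df.shape[1]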
def vstack(gctoos, remove_all_metadata_fields=False, error_report_file=None, fields_to_remove=[], reset_ids=False):
""" Vertically concatenate gctoos.
Args:
gctoos (list of gctoo objects)
remove_all_metadata_fields (bool): ignore/strip all common metadata when combining gctoos
error_report_file (string): path to write file containing error report indicating
problems that occurred during vstack, mainly for inconsistencies in common metadata
fields_to_remove (list of strings): fields to be removed from the
common metadata because they don't agree across files
reset_ids (bool): set to True if row ids are not unique
Return:
concated (gctoo object)
"""
# Separate each gctoo into its component dfs
row_meta_dfs = []
col_meta_dfs = []
data_dfs = []
srcs = []
for g in gctoos:
row_meta_dfs.append(g.row_metadata_df)
col_meta_dfs.append(g.col_metadata_df)
data_dfs.append(g.data_df)
srcs.append(g.src)
# Concatenate col metadata
all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
    # Concatenate row metadata
all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs, remove_all_metadata_fields)
# Concatenate the data_dfs
all_data_df = assemble_data(data_dfs, "vert")
# Make sure df shapes are correct
    assert all_data_df.shape[0] == all_row_metadata_df.shape[0], "Number of rows in data does not match number of rows in row metadata - all_data_df.shape[0]: {} all_row_metadata_df.shape[0]: {}".format(all_data_df.shape[0], all_row_metadata_df.shape[0])
    assert all_data_df.shape[1] == all_col_metadata_df.shape[0], "Number of columns in data does not match number of rows in column metadata - all_data_df.shape[1]: {} all_col_metadata_df.shape[0]: {}".format(all_data_df.shape[1], all_col_metadata_df.shape[0])
    # If requested, reset row ids to be unique integers and move old row
    # ids into row metadata
if reset_ids:
do_reset_ids(all_row_metadata_df, all_data_df, "vert")
logger.info("Build GCToo of all...")
concated = GCToo.GCToo(row_metadata_df=all_row_metadata_df,
col_metadata_df=all_col_metadata_df,
data_df=all_data_df)
return concated
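# Example programmatic use of vstack (a sketch; the filenames are hypothetical;
# pass reset_ids=True if the two files share row ids, which would otherwise
# produce duplicate rids):
#
#   combined = vstack([parse.parse("batch1.gct"), parse.parse("batch2.gct")],
#                     reset_ids=True)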
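# assemble_common_meta and assemble_concatenated_meta are elided from this
# excerpt. The definitions below are simplified, hypothetical stand-ins that
# are consistent with how hstack and vstack call them; the full versions also
# write the report of inconsistent common-metadata fields described in the
# --error_report_output_file help text above.
def assemble_common_meta(common_meta_dfs, fields_to_remove, sources,
                         remove_all_metadata_fields, error_report_file):
    """ Simplified sketch: drop unwanted fields, require the remaining common
    metadata to agree across all files, and keep a single copy of it. Assumes
    every file carries identical ids in the common dimension. """
    if remove_all_metadata_fields:
        # Keep only the ids (i.e. the index) of each df
        trimmed = [df[[]] for df in common_meta_dfs]
    else:
        trimmed = [df.drop(fields_to_remove, axis=1, errors="ignore")
                   for df in common_meta_dfs]

    # Sort both axes so that the comparison below is order-independent
    trimmed = [df.sort_index(axis=0).sort_index(axis=1) for df in trimmed]

    # The remaining common metadata must agree across files; if it does not,
    # remove the offending fields with fields_to_remove
    for df, src in zip(trimmed[1:], sources[1:]):
        assert df.equals(trimmed[0]), (
            "common metadata of {} does not agree with that of {}".format(
                src, sources[0]))

    return trimmed[0]

def assemble_concatenated_meta(concated_meta_dfs, remove_all_metadata_fields):
    """ Simplified sketch: stack the metadata of the concatenated dimension
    and sort both axes. """
    if remove_all_metadata_fields:
        # Keep only the ids (i.e. the index) of each df
        concated_meta_dfs = [df[[]] for df in concated_meta_dfs]

    all_concated_meta_df = pd.concat(concated_meta_dfs, axis=0)

    # Sanity check: no entries should be lost or gained by the concat
    assert all_concated_meta_df.shape[0] == sum(
        df.shape[0] for df in concated_meta_dfs)

    # Sort both the index and the columns
    return all_concated_meta_df.sort_index(axis=0).sort_index(axis=1)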
def assemble_data(data_dfs, concat_direction):
""" Assemble the data dfs together. Both indices are sorted.
Args:
data_dfs (list of pandas dfs)
concat_direction (string): 'horiz' or 'vert'
Returns:
all_data_df_sorted (pandas df)
"""
if concat_direction == "horiz":
# Concatenate the data_dfs horizontally
all_data_df = pd.concat(data_dfs, axis=1)
# Sanity check: the number of columns in all_data_df should
# correspond to the sum of the number of columns in the input dfs
n_cols = all_data_df.shape[1]
logger.debug("all_data_df.shape[1]: {}".format(n_cols))
n_cols_cumulative = sum([df.shape[1] for df in data_dfs])
assert n_cols == n_cols_cumulative
elif concat_direction == "vert":
# Concatenate the data_dfs vertically
all_data_df = pd.concat(data_dfs, axis=0)
# Sanity check: the number of rows in all_data_df should
# correspond to the sum of the number of rows in the input dfs
n_rows = all_data_df.shape[0]
logger.debug("all_data_df.shape[0]: {}".format(n_rows))
n_rows_cumulative = sum([df.shape[0] for df in data_dfs])
assert n_rows == n_rows_cumulative
# Sort both indices
all_data_df_sorted = all_data_df.sort_index(axis=0).sort_index(axis=1)
return all_data_df_sorted
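# Example (a sketch illustrating the sorting behavior): two single-row dfs
# that share the index ["gene1"] but have columns ["s2"] and ["s1"] come back
# with the columns in sorted order:
#
#   a = pd.DataFrame({"s2": [1.0]}, index=["gene1"])
#   b = pd.DataFrame({"s1": [2.0]}, index=["gene1"])
#   assemble_data([a, b], "horiz").columns.tolist()  # -> ["s1", "s2"]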
def do_reset_ids(concatenated_meta_df, data_df, concat_direction):
""" Reset ids in concatenated metadata and data dfs to unique integers and
save the old ids in a metadata column.
Note that the dataframes are modified in-place.
Args:
concatenated_meta_df (pandas df)
data_df (pandas df)
concat_direction (string): 'horiz' or 'vert'
Returns:
None (dfs modified in-place)
"""
if concat_direction == "horiz":
# Make sure cids agree between data_df and concatenated_meta_df
assert concatenated_meta_df.index.equals(data_df.columns), (
"cids in concatenated_meta_df do not agree with cids in data_df.")
# Reset cids in concatenated_meta_df
reset_ids_in_meta_df(concatenated_meta_df)
# Replace cids in data_df with the new ones from concatenated_meta_df
# (just an array of unique integers, zero-indexed)
data_df.columns = pd.Index(concatenated_meta_df.index.values)
elif concat_direction == "vert":
# Make sure rids agree between data_df and concatenated_meta_df
assert concatenated_meta_df.index.equals(data_df.index), (
"rids in concatenated_meta_df do not agree with rids in data_df.")
# Reset rids in concatenated_meta_df
reset_ids_in_meta_df(concatenated_meta_df)
# Replace rids in data_df with the new ones from concatenated_meta_df
# (just an array of unique integers, zero-indexed)
data_df.index = pd.Index(concatenated_meta_df.index.values)
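# reset_ids_in_meta_df is elided from this excerpt. A minimal sketch,
# consistent with how do_reset_ids uses it (modifies meta_df in-place, moves
# the old ids into a metadata column, and renumbers the index with unique,
# zero-indexed integers); the column name "old_id" is an assumption:
def reset_ids_in_meta_df(meta_df):
    """ meta_df is modified in-place. """
    # Record the original index name so it can be restored afterwards
    original_index_name = meta_df.index.name

    # Name the index so that the column it becomes is appropriately named
    meta_df.index.name = "old_id"

    # Move the old ids into a column and replace the index with 0..n-1
    meta_df.reset_index(inplace=True)

    # Restore the original index name
    meta_df.index.name = original_index_name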
if __name__ == "__main__":
main()