# Source code for cmapPy.pandasGEXpress.write_gct

import logging
import pandas as pd
import numpy as np
import os
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger

# Module authorship metadata.
__author__ = "Lev Litichevskiy"
__email__ = "lev@broadinstitute.org"

# Module-level logger, looked up under the name that setup_GCToo_logger
# exposes as LOGGER_NAME.
logger = logging.getLogger(setup_logger.LOGGER_NAME)

# Only writes GCT1.3. write() hands this value to write_version_and_dims,
# which emits it as the "#<version>" first line of the output file.
VERSION = "1.3"


def write(gctoo, out_fname, data_null="NaN", metadata_null="-666",
          filler_null="-666", data_float_format="%.4f"):
    """Write a gctoo object to a gct file.

    Args:
        gctoo (gctoo object): must expose data_df, row_metadata_df and
            col_metadata_df
        out_fname (string): filename for output gct file; ".gct" is appended
            if it is not already the suffix
        data_null (string): how to represent missing values in the data
            (default = "NaN")
        metadata_null (string): how to represent missing values in the
            metadata (default = "-666")
        filler_null (string): what value to fill the top-left filler block
            with (default = "-666")
        data_float_format (string): how many decimal points to keep in
            representing data (default = 4 digits; None will keep all digits)

    Returns:
        None

    """
    if not out_fname.endswith(".gct"):
        out_fname += ".gct"

    # Second line of the gct: number of rows, columns, row metadata fields
    # and column metadata fields
    dims = [str(gctoo.data_df.shape[0]), str(gctoo.data_df.shape[1]),
            str(gctoo.row_metadata_df.shape[1]), str(gctoo.col_metadata_df.shape[1])]

    # The context manager closes the handle even if one of the writers raises
    with open(out_fname, "w") as f:
        # Write first two lines
        write_version_and_dims(VERSION, dims, f)

        # Write top half of the gct
        write_top_half(f, gctoo.row_metadata_df, gctoo.col_metadata_df,
                       metadata_null, filler_null)

        # Write bottom half of the gct
        write_bottom_half(f, gctoo.row_metadata_df, gctoo.data_df,
                          data_null, data_float_format, metadata_null)

    logger.info("GCT has been written to {}".format(out_fname))


def write_version_and_dims(version, dims, f):
    """Write first two lines of gct file.

    Args:
        version (string): 1.3 by default
        dims (list of strings): length = 4
        f (file handle): handle of output file

    Returns:
        nothing

    """
    f.write("#" + version + "\n")
    f.write("\t".join(dims) + "\n")


def _stringify_metadata(metadata_df, metadata_null):
    """Return the values of metadata_df as strings, with missing entries
    replaced by metadata_null.

    Args:
        metadata_df (pandas df)
        metadata_null (string): how to represent missing values

    Returns:
        numpy array with the same shape as metadata_df

    """
    as_strings = metadata_df.astype(str).replace("nan", value=metadata_null)

    # fillna catches entries that astype(str) left as missing rather than
    # turning them into the string "nan"; otherwise it changes nothing
    return as_strings.fillna(metadata_null).values


def write_top_half(f, row_metadata_df, col_metadata_df, metadata_null, filler_null):
    """Write the top half of the gct file: top-left filler values, row
    metadata headers, and top-right column metadata.

    Args:
        f (file handle): handle for output file
        row_metadata_df (pandas df)
        col_metadata_df (pandas df)
        metadata_null (string): how to represent missing values in the metadata
        filler_null (string): what value to fill the top-left filler block with

    Returns:
        None

    """
    num_rhd = row_metadata_df.shape[1]
    num_cids, num_chd = col_metadata_df.shape

    # Initialize the top half of the gct including the third line; every cell
    # starts as filler so only the header and metadata cells need overwriting
    top_half = np.full((1 + num_chd, 1 + num_rhd + num_cids), filler_null, dtype=object)

    # Assemble the third line of the gct: "id", then rhds, then cids
    top_half[0, :] = np.hstack(("id", row_metadata_df.columns.values,
                                col_metadata_df.index.values))

    # Insert the chds
    top_half[1:, 0] = col_metadata_df.columns.values

    # Insert the column metadata (one row per chd, one column per cid).
    # Plain array slices replace the original DataFrame.at[range, range]
    # assignment; .at is a scalar accessor.
    top_half[1:, 1 + num_rhd:] = _stringify_metadata(col_metadata_df, metadata_null).T

    # Write the top half to file
    pd.DataFrame(top_half).to_csv(f, header=False, index=False, sep="\t")


def write_bottom_half(f, row_metadata_df, data_df, data_null, data_float_format, metadata_null):
    """Write the bottom half of the gct file: row metadata and data.

    Args:
        f (file handle): handle for output file
        row_metadata_df (pandas df)
        data_df (pandas df)
        data_null (string): how to represent missing values in the data
        metadata_null (string): how to represent missing values in the metadata
        data_float_format (string): how many decimal points to keep in
            representing data

    Returns:
        None

    """
    num_rids, num_rhd = row_metadata_df.shape

    # Create the left side of the bottom half of the gct: the rids followed
    # by the row metadata converted to strings
    left_bottom_half = np.full((num_rids, 1 + num_rhd), metadata_null, dtype=object)
    left_bottom_half[:, 0] = row_metadata_df.index.values
    left_bottom_half[:, 1:] = _stringify_metadata(row_metadata_df, metadata_null)

    # Create the full bottom half by combining the above with the matrix
    # data; the data index is dropped so rows are joined by position
    bottom_half_df = pd.concat(
        [pd.DataFrame(left_bottom_half), data_df.reset_index(drop=True)], axis=1)
    bottom_half_df.columns = range(bottom_half_df.shape[1])

    # Write bottom_half_df to file
    bottom_half_df.to_csv(f, header=False, index=False, sep="\t",
                          na_rep=data_null, float_format=data_float_format)


def append_dims_and_file_extension(fname, data_df):
    """Append dimensions and file extension to output filename.
    N.B. Dimensions are cols x rows.

    Args:
        fname (string): output filename
        data_df (pandas df)

    Returns:
        out_fname (string): output filename with matrix dims and .gct appended

    """
    # An existing .gct extension is stripped so the dims end up in front of it
    basename = os.path.splitext(fname)[0] if fname.endswith(".gct") else fname

    return '{0}_n{1}x{2}.gct'.format(basename, data_df.shape[1], data_df.shape[0])