Module `plumbing.xls_tables`

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

Expand source code

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Written by Lucas Sinclair.
MIT Licensed.
Contact at www.sinclair.bio
"""

# Built-in modules #

# First party modules #
from autopaths import Path

# Third party modules #
import pandas

# Internal modules #
from plumbing.cache import property_cached

###############################################################################
class MultiDataFrameXLS:
    """
    Takes several dataframes and writes them to an XLS file.
    The dataframes are spread through different work sheets.

    In addition, each work sheet can contain an arbitrary number
    of dataframes.

    You have to provide a dictionary where:

    * Each key is the name of a given work sheet in the final XLS as a string.
    * Each value is a list containing an arbitrary number of dictionaries.

    * Each one of these dictionaries must contain a DataFrame in the
      'dataframe' key, as well as optional extra labels as seen below.

            sheet = {
                'dataframe': df,
                'title':     "Best dataframe ever",
                'x_title':   "Foot length",
                'y_title':   "Yearly income",
                'x_label':   None,
                'y_label':   None,
                'x_extra':   None,
                'y_extra':   None,
            }
    """

    # Parameters #
    spacing = 6
    indentation = 1

    def __init__(self, sheet_to_dfs, path):
        self.sheet_to_dfs = sheet_to_dfs
        self.path         = Path(path)

    def __call__(self):
        """
        Write several dataframes, to several excel sheets.
        """
        # Create path if not exists #
        self.path.directory.create_if_not_exists()
        # Create a writer #
        self.writer = pandas.ExcelWriter(str(self.path), engine='xlsxwriter')
        # Create a sheet per every key #
        for key in self.sheet_to_dfs:
            worksheet = self.writer.book.add_worksheet(key)
            self.writer.sheets[key] = worksheet
        # Write each sheet #
        for key in self.sheet_to_dfs: self.write_one_sheet(key)
        # Save #
        self.writer.save()
        # Return #
        return self.path

    def write_one_sheet(self, key):
        """
        Write several dataframes, all to the same excel sheet.
        It will append a custom title before hand for each
        dataframe.
        """
        # Get sheet #
        sheet = self.writer.sheets[key]
        # Get dataframes #
        all_dfs = self.sheet_to_dfs[key]
        # Initialize #
        row = 0
        # Loop #
        for info in all_dfs:
            # Get dataframe #
            df = info['dataframe']
            # Write custom title #
            sheet.write_string(row, 0, info.get('title', ''))
            row += 2
            # Add extras #
            df.index.name   = info.get('y_extra', '')
            df.columns.name = info.get('x_extra', '')
            # Add Y labels #
            title, label = info.get('y_title', ''), info.get('y_label', '')
            df = pandas.concat({title: df}, names=[label])
            # Add X labels #
            title, label = info.get('x_title', ''), info.get('x_label', '')
            df = pandas.concat({title: df}, names=[label], axis=1)
            # Write dataframe #
            df.to_excel(self.writer,
                        sheet_name = key,
                        startrow   = row,
                        startcol   = self.indentation)
            # Increment #
            row += len(df.index) + self.spacing

###############################################################################
class ConvertExcelToCSV:
    """
    Will convert an excel file into its CSV equivalent.
    Can support multiple work sheets into a single CSV.
    """

    def __init__(self, source_path, dest_path, **kwargs):
        # Record attributes #
        self.source = Path(source_path)
        self.dest   = Path(dest_path)
        # Keep the kwargs too #
        self.kwargs = kwargs
        # Check directory case #
        if self.dest.endswith('/'):
            self.dest = self.dest + self.source.filename
            self.dest = self.dest.replace_extension('csv')

    def __call__(self):
        """Are we mono or multi sheet?"""
        if len(self.handle.sheet_names) > 1: self.multi_sheet()
        else:                                self.mono_sheet()

    @property_cached
    def handle(self):
        """Pandas handle to the excel file."""
        return pandas.ExcelFile(str(self.source))

    def mono_sheet(self):
        """Supports only one work sheet per file."""
        xls = pandas.read_excel(str(self.source))
        xls.to_csv(str(self.dest), **self.kwargs)

    def multi_sheet(self):
        """
        Supports multiple work sheets per file.
        Will concatenate sheets together by adding an extra column
        containing the original sheet name.
        """
        # Initialize #
        all_sheets = []
        # Loop #
        for name in self.handle.sheet_names:
            sheet = self.handle.parse(name)
            sheet.insert(0, "nace", name)
            all_sheets.append(sheet)
        # Write #
        df = pandas.concat(all_sheets)
        df.to_csv(str(self.dest), **self.kwargs)

Classes

class ConvertExcelToCSV (source_path, dest_path, **kwargs)

Will convert an excel file into its CSV equivalent. Can support multiple work sheets into a single CSV.

Expand source code

class ConvertExcelToCSV:
    """
    Will convert an excel file into its CSV equivalent.
    Can support multiple work sheets into a single CSV.
    """

    def __init__(self, source_path, dest_path, **kwargs):
        # Record attributes #
        self.source = Path(source_path)
        self.dest   = Path(dest_path)
        # Keep the kwargs too #
        self.kwargs = kwargs
        # Check directory case #
        if self.dest.endswith('/'):
            self.dest = self.dest + self.source.filename
            self.dest = self.dest.replace_extension('csv')

    def __call__(self):
        """Are we mono or multi sheet?"""
        if len(self.handle.sheet_names) > 1: self.multi_sheet()
        else:                                self.mono_sheet()

    @property_cached
    def handle(self):
        """Pandas handle to the excel file."""
        return pandas.ExcelFile(str(self.source))

    def mono_sheet(self):
        """Supports only one work sheet per file."""
        xls = pandas.read_excel(str(self.source))
        xls.to_csv(str(self.dest), **self.kwargs)

    def multi_sheet(self):
        """
        Supports multiple work sheets per file.
        Will concatenate sheets together by adding an extra column
        containing the original sheet name.
        """
        # Initialize #
        all_sheets = []
        # Loop #
        for name in self.handle.sheet_names:
            sheet = self.handle.parse(name)
            sheet.insert(0, "nace", name)
            all_sheets.append(sheet)
        # Write #
        df = pandas.concat(all_sheets)
        df.to_csv(str(self.dest), **self.kwargs)

Instance variables

var handle

Pandas handle to the excel file.

Expand source code

def __get__(self, instance, owner):
    """
    If you see the current source code in a seemingly unrelated part of
    an auto-generated documentation, it means the program making the
    documentation was unable to correctly traverse a decorated property.
    """
    # For debugging purposes #
    if False: print("-> property cached `%s`" % instance)
    # If called from a class #
    if instance is None: return self
    # Does a cache exist for this instance? #
    self.check_cache(instance)
    # Is the answer in the cache? #
    if self.name in instance.__cache__: return instance.__cache__[self.name]
    # If not we will compute it #
    if inspect.isgeneratorfunction(self.func): result = tuple(self.func(instance))
    else:                                      result = self.func(instance)
    # Let's store the answer for later #
    instance.__cache__[self.name] = result
    # Return #
    return result

Methods

def mono_sheet(self)

Supports only one work sheet per file.

Expand source code

def mono_sheet(self):
    """Supports only one work sheet per file."""
    xls = pandas.read_excel(str(self.source))
    xls.to_csv(str(self.dest), **self.kwargs)

def multi_sheet(self)

Supports multiple work sheets per file. Will concatenate sheets together by adding an extra column containing the original sheet name.

Expand source code

def multi_sheet(self):
    """
    Supports multiple work sheets per file.
    Will concatenate sheets together by adding an extra column
    containing the original sheet name.
    """
    # Initialize #
    all_sheets = []
    # Loop #
    for name in self.handle.sheet_names:
        sheet = self.handle.parse(name)
        sheet.insert(0, "nace", name)
        all_sheets.append(sheet)
    # Write #
    df = pandas.concat(all_sheets)
    df.to_csv(str(self.dest), **self.kwargs)

class MultiDataFrameXLS (sheet_to_dfs, path)

Takes several dataframes and writes them to an XLS file. The dataframes are spread through different work sheets.

In addition, each work sheet can contain an arbitrary number of dataframes.

You have to provide a dictionary where:

Each key is the name of a given work sheet in the final XLS as a string.
Each value is a list containing an arbitrary number of dictionaries.

Each one of these dictionaries must contain a DataFrame in the 'dataframe' key, as well as optional extra labels as seen below.

sheet = {
    'dataframe': df,
    'title':     "Best dataframe ever",
    'x_title':   "Foot length",
    'y_title':   "Yearly income",
    'x_label':   None,
    'y_label':   None,
    'x_extra':   None,
    'y_extra':   None,
}

Expand source code

class MultiDataFrameXLS:
    """
    Takes several dataframes and writes them to an XLS file.
    The dataframes are spread through different work sheets.

    In addition, each work sheet can contain an arbitrary number
    of dataframes.

    You have to provide a dictionary where:

    * Each key is the name of a given work sheet in the final XLS as a string.
    * Each value is a list containing an arbitrary number of dictionaries.

    * Each one of these dictionaries must contain a DataFrame in the
      'dataframe' key, as well as optional extra labels as seen below.

            sheet = {
                'dataframe': df,
                'title':     "Best dataframe ever",
                'x_title':   "Foot length",
                'y_title':   "Yearly income",
                'x_label':   None,
                'y_label':   None,
                'x_extra':   None,
                'y_extra':   None,
            }
    """

    # Parameters #
    spacing = 6
    indentation = 1

    def __init__(self, sheet_to_dfs, path):
        self.sheet_to_dfs = sheet_to_dfs
        self.path         = Path(path)

    def __call__(self):
        """
        Write several dataframes, to several excel sheets.
        """
        # Create path if not exists #
        self.path.directory.create_if_not_exists()
        # Create a writer #
        self.writer = pandas.ExcelWriter(str(self.path), engine='xlsxwriter')
        # Create a sheet per every key #
        for key in self.sheet_to_dfs:
            worksheet = self.writer.book.add_worksheet(key)
            self.writer.sheets[key] = worksheet
        # Write each sheet #
        for key in self.sheet_to_dfs: self.write_one_sheet(key)
        # Save #
        self.writer.save()
        # Return #
        return self.path

    def write_one_sheet(self, key):
        """
        Write several dataframes, all to the same excel sheet.
        It will append a custom title before hand for each
        dataframe.
        """
        # Get sheet #
        sheet = self.writer.sheets[key]
        # Get dataframes #
        all_dfs = self.sheet_to_dfs[key]
        # Initialize #
        row = 0
        # Loop #
        for info in all_dfs:
            # Get dataframe #
            df = info['dataframe']
            # Write custom title #
            sheet.write_string(row, 0, info.get('title', ''))
            row += 2
            # Add extras #
            df.index.name   = info.get('y_extra', '')
            df.columns.name = info.get('x_extra', '')
            # Add Y labels #
            title, label = info.get('y_title', ''), info.get('y_label', '')
            df = pandas.concat({title: df}, names=[label])
            # Add X labels #
            title, label = info.get('x_title', ''), info.get('x_label', '')
            df = pandas.concat({title: df}, names=[label], axis=1)
            # Write dataframe #
            df.to_excel(self.writer,
                        sheet_name = key,
                        startrow   = row,
                        startcol   = self.indentation)
            # Increment #
            row += len(df.index) + self.spacing

Class variables

var indentation
var spacing

Methods

def write_one_sheet(self, key)

Write several dataframes, all to the same excel sheet. It will append a custom title before hand for each dataframe.

Expand source code

def write_one_sheet(self, key):
    """
    Write several dataframes, all to the same excel sheet.
    It will append a custom title before hand for each
    dataframe.
    """
    # Get sheet #
    sheet = self.writer.sheets[key]
    # Get dataframes #
    all_dfs = self.sheet_to_dfs[key]
    # Initialize #
    row = 0
    # Loop #
    for info in all_dfs:
        # Get dataframe #
        df = info['dataframe']
        # Write custom title #
        sheet.write_string(row, 0, info.get('title', ''))
        row += 2
        # Add extras #
        df.index.name   = info.get('y_extra', '')
        df.columns.name = info.get('x_extra', '')
        # Add Y labels #
        title, label = info.get('y_title', ''), info.get('y_label', '')
        df = pandas.concat({title: df}, names=[label])
        # Add X labels #
        title, label = info.get('x_title', ''), info.get('x_label', '')
        df = pandas.concat({title: df}, names=[label], axis=1)
        # Write dataframe #
        df.to_excel(self.writer,
                    sheet_name = key,
                    startrow   = row,
                    startcol   = self.indentation)
        # Increment #
        row += len(df.index) + self.spacing