Source code for asaptools.partition

"""
A module for data partitioning functions.

This provides a collection of 'partitioning' functions.  A partitioning
function is a three-argument function that takes, as the first argument, a
given data object and, as the second argument, an index into that object and,
as the third argument, a maximum index.  The operation of the partitioning
function is to return a subset of the data corresponding to the given index.

By design, partitioning functions should keep the data "unchanged" except for
subselecting parts of the data.


Copyright 2020 University Corporation for Atmospheric Research

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from abc import ABCMeta, abstractmethod
from operator import itemgetter


[docs]class PartitionFunction(object):

    """
    The abstract base-class for all Partitioning Function objects.

    A PartitionFunction object is one with a __call__ method that takes
    three arguments.  The first argument is the data to be partitioned, the
    second argument is the index of the partition (or part) requested, and
    third argument is the number of partitions to assume when dividing
    the data.
    """

    __metaclass__ = ABCMeta

    @staticmethod
    def _check_types(data, index, size):
        """
        Check the types of the index and size arguments.

        Parameters:
            data: The data to be partitioned
            index (int): The index of the partition to return
            size (int): The number of partitions to make

        Raises:
            TypeError: The size or index arguments are not int
            IndexError: The size argument is less than 1, or the index
                argument is less than 0 or greater than or equal to size
        """

        # Check the type of the index
        if type(index) is not int:
            raise TypeError('Partition index must be an integer')

        # Check the value of index
        if index > size - 1 or index < 0:
            raise IndexError('Partition index out of bounds')

        # Check the type of the size
        if type(size) is not int:
            raise TypeError('Partition size must be an integer')

        # Check the value of size
        if size < 1:
            raise IndexError('Partition size less than 1 is invalid')

    @staticmethod
    def _is_indexable(data):
        """
        Check if the data object is indexable.

        Parameters:
            data: The data to be partitioned

        Returns:
            bool: True, if data is an indexable object. False, otherwise.
        """
        if hasattr(data, '__len__') and hasattr(data, '__getitem__'):
            return True
        else:
            return False

    @staticmethod
    def _are_pairs(data):
        """
        Check if the data object is an indexable list of pairs.

        Parameters:
            data: The data to be partitioned

        Returns:
            bool: True, if data is an indexable list of pairs.
                False, otherwise.
        """
        if PartitionFunction._is_indexable(data):
            return all(map(lambda i: PartitionFunction._is_indexable(i) and len(i) == 2, data))
        else:
            return False

    @abstractmethod
    def __call__(self):
        """
        Implements the partition algorithm.
        """
        return


[docs]class Duplicate(PartitionFunction):

    """
    Return a copy of the original input data in each partition.
    """

    def __call__(self, data, index=0, size=1):
        """
        Define the common interface for all partitioning functions.

        The abstract base class implements the check on the input for correct
        format and typing.

        Parameters:
            data: The data to be partitioned

        Keyword Arguments:
            index (int): A partition index into a part of the data
            size (int): The largest number of partitions allowed

        Returns:
            The indexed part of the data, assuming the data is divided into
            size parts.
        """
        self._check_types(data, index, size)

        return data


[docs]class EqualLength(PartitionFunction):

    """
    Partition an indexable object by striding through the data.

    The initial object is "chopped" along its length into roughly equal length
    sublists.  If the partition size is greater than the length of the input
    data, then it will return an empty list for 'empty' partitions.  If the
    data is not indexable, then it will return the data for index=0 only, and
    an empty list otherwise.
    """

    def __call__(self, data, index=0, size=1):
        """
        Define the common interface for all partitioning functions.

        The abstract base class implements the check on the input for correct
        format and typing.

        Parameters:
            data: The data to be partitioned

        Keyword Arguments:
            index (int): A partition index into a part of the data
            size (int): The largest number of partitions allowed

        Returns:
            The indexed part of the data, assuming the data is divided into
            size parts.
        """
        self._check_types(data, index, size)

        if self._is_indexable(data):
            (lenpart, remdata) = divmod(len(data), size)
            psizes = [lenpart] * size
            for i in range(remdata):
                psizes[i] += 1
            ibeg = 0
            for i in range(index):
                ibeg += psizes[i]
            iend = ibeg + psizes[index]
            return data[ibeg:iend]
        else:
            if index == 0:
                return [data]
            else:
                return []


[docs]class EqualStride(PartitionFunction):

    """
    Partition an object by chopping the data into roughly equal lengths.

    This returns a sublist of an indexable object by "striding" through the
    data in steps equal to the partition size.  If the partition size is
    greater than the length of the input data, then it will return an empty
    list for "empty" partitions.  If the data is not indexable, then it will
    return the data for index=0 only, and an empty list otherwise.
    """

    def __call__(self, data, index=0, size=1):
        """
        Define the common interface for all partitioning functions.

        The abstract base class implements the check on the input for correct
        format and typing.

        Parameters:
            data: The data to be partitioned

        Keyword Arguments:
            index (int): A partition index into a part of the data
            size (int): The largest number of partitions allowed

        Returns:
            The indexed part of the data, assuming the data is divided into
            size parts.
        """
        self._check_types(data, index, size)

        if self._is_indexable(data):
            if index < len(data):
                return data[index::size]
            else:
                return []
        else:
            if index == 0:
                return [data]
            else:
                return []


[docs]class SortedStride(PartitionFunction):

    """
    Partition an indexable list of pairs by striding through sorted data.

    The first index of each pair is assumed to be an item of data (which will
    be partitioned), and the second index in each pair is assumed to be a
    numeric weight.  The pairs are first sorted by weight, and then partitions
    are returned by striding through the sorted data.

    The results are partitions of roughly equal length and roughly equal
    total weight.  However, equal length is prioritized over total weight.
    """

    def __call__(self, data, index=0, size=1):
        """
        Define the common interface for all partitioning functions.

        The abstract base class implements the check on the input for correct
        format and typing.

        Parameters:
            data: The data to be partitioned

        Keyword Arguments:
            index (int): A partition index into a part of the data
            size (int): The largest number of partitions allowed

        Returns:
            The indexed part of the data, assuming the data is divided into
            size parts.
        """
        self._check_types(data, index, size)

        if self._are_pairs(data):
            subdata = [q[0] for q in sorted(data, key=itemgetter(1))]
            return EqualStride()(subdata, index=index, size=size)
        else:
            return EqualStride()(data, index=index, size=size)


[docs]class WeightBalanced(PartitionFunction):

    """
    Partition an indexable list of pairs by balancing the total weight.

    The first index of each pair is assumed to be an item of data (which will
    be partitioned), and the second index in each pair is assumed to be a
    numeric weight.  The data items are grouped via a "greedy" binning
    algorithm into partitions of roughly equal total weight.

    The results are partitions of roughly equal length and roughly equal
    total weight.  However, equal total weight is prioritized over length.

    """

    def __call__(self, data, index=0, size=1):
        """
        Define the common interface for all partitioning functions.

        The abstract base class implements the check on the input for correct
        format and typing.

        Parameters:
            data: The data to be partitioned

        Keyword Arguments:
            index (int): A partition index into a part of the data
            size (int): The largest number of partitions allowed

        Returns:
            The indexed part of the data, assuming the data is divided into
            size parts.
        """
        self._check_types(data, index, size)

        if self._are_pairs(data):
            sorted_pairs = sorted(data, key=itemgetter(1), reverse=True)
            partition = []
            weights = [0] * size
            for (item, weight) in sorted_pairs:
                k = min(enumerate(weights), key=itemgetter(1))[0]
                if k == index:
                    partition.append(item)
                weights[k] += weight
            return partition
        else:
            return EqualStride()(data, index=index, size=size)