| home | about me | feed RSS feed

The code below is intended to provide SQL's GROUPING SETS functionality in Python with the aid of pandas.

Background on GROUPING SETS

There are at least two advantages to doing this in Python:

There is at least one disadvantage to doing this in Python:

The example below has been tested with Python 2.7.10 and Pandas 0.17.0.

from __future__ import division, print_function

import itertools as it
import pandas as pd

from pandas.util.testing import assert_frame_equal

def powerset(iterable): 
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return it.chain.from_iterable(it.combinations(s,r) for r in range(len(s)+1))

def grouper(df,grpby=None,aggfunc=None):
    ''' produces aggregate DataFrame from DataFrames for non-redundant groupings
       `workingdf` is used to avoid modifying original DataFrame
    uniqcols = (i for i in grpby if len(df[i].unique()) == 1)
    pwrset = (i for i in powerset(grpby))
    s = set()
    for uniqcol in uniqcols:
        for i in pwrset:
            if uniqcol in i:
                s.add(i) # add a level of aggregation only when non-redundant

    workingdf = df.copy()
    #for idx,i in enumerate(powerset(grpby)):
    for idx,i in enumerate(s):
        print('  grouping by: {}'.format(i))
        if i != (): 
            tmp = aggfunc( workingdf.groupby(i) )
            # hack to get output to be a DataFrameGroupBy object:
            #   insert dummy column on which to group by
            # old, naive code:
            #   tmp = aggfunc( workingdf )
            dummycolname = hash(tuple(workingdf.columns.tolist()))
            workingdf[dummycolname] = ''
            tmp = aggfunc( workingdf.groupby(dummycolname) )

        # drop the index and add it back
        if i == (): tmp.reset_index(drop=True,inplace=True)
        else: tmp.reset_index(inplace=True)

        for j in grpby: 
            if j not in tmp: # if column is not in DataFrame add it
                tmp[j] = '(All)'

        # new list with all columns including aggregate ones; do this only once
        if idx == 0:
            finalcols = grpby[:]
            addlcols = [k for k in tmp if k not in grpby] # aggregate columns 

        # reorder columns 
        tmp = tmp[finalcols] 

        # creation of final DataFrame
        if idx == 0:
            final = tmp; del tmp
            final = pd.concat( [final,tmp] ); del tmp

    del workingdf

    final = final.sort(columns=finalcols)

    return final

def agg(grpbyobj):
    ''' the purpose of this function is to: 
          specify aggregate operation(s) you wish to perform, 
          name the resulting column(s) in the final DataFrame.
    tmp = pd.DataFrame()
    tmp['Total (n)'] = grpbyobj['Total'].sum()
    return tmp

if __name__ == '__main__':
    df = pd.DataFrame({'Area':['a','a','b',],
    final = grouper(df,grpby=['Area','Year'],aggfunc=agg)

    # test against expected result 
    expected = '''{"Area":{"0":"(All)","1":"a","2":"b"},
                   "Total (n)":{"0":15,"1":9,"2":6}}'''
    testfinal = pd.read_json(expected)
    testfinal = testfinal[final.columns.tolist()] # reorder columns 
        # check_names kwarg True: compare indexes and columns 
    except AssertionError as e:
created: 2015-11-18 | updated: 2015-11-19

Comments are now closed for this post.