validium validators for pandas dataframes


Keywords
validation, dataframe, pandas, validium, rambda
License
MIT
Install
pip install validframe==0.2.8

Documentation

🖼 validframe

PyPI version Build Status Coverage Status

validium validators for pandas dataframes

Quick install

pip install validframe

Basic usage

Need some faith in those frames? Let's dive in.

Predefined validators

Out-of-the-box you get a set of validator factories to handle the considerably more common ways to validate dataframes:

import pandas as pd
import numpy as np

df = pd.DataFrame(
  columns = ['like_counts','comment'], # headers
  data = [
    [42, 'hello world'], # row 0
    [100000, '😆'], # row 1
    [123456, 'lol'], # row 2
    [987, "you're the baz"] # row 3
  ])


validators = [
  vf.frame.not_empty(), # frame must be not empty
  vf.frame.empty(), # frame must be empty
  vf.frame.rows(4), # frame must have 4 rows
  vf.frame.rows(100), # frame must have 100 rows
  vf.frame.cols(2), # frame must have 2 cols

  vf.rows.uniq(), # rows must be unique

  vf.cells.all_is(str, cols=['comment']), # all cells must be instances of <str>
  vf.cells.all_eq(1, cols=['like_counts']), # all cells must equal 1
  vf.cells.all_gt(0, cols=['like_counts']), # all cells must be greater than 0
  vf.cells.all_lt(0, cols=['like_counts']), # all cells must be less than 0
  vf.cells.all_gte(0, cols=['like_counts']), # all cells must be greater than or equal to 0
  vf.cells.all_lte(0, cols=['like_counts']), # all cells must be less than or equal to 0

  vf.cells.some_eq(42, cols=['like_counts']), # some cells must equal 42
  vf.cells.some_is(np.nan, cols=['comment']), # some cells must be instances of <numpy.nan>
  vf.cells.some_gt(100000, cols=['like_counts']), # some cells must be greater than 100000
  vf.cells.some_lt(987, cols=['like_counts']), # some cells must be less than 987
  vf.cells.some_gte(100000, cols=['like_counts']), # some cells must be greater than or equal to 100000
  vf.cells.some_lte(987, cols=['like_counts']), # some cells must be less than or equal to 987

  vf.cells.none_eq(0, cols=['like_counts']), # no cells must equal 0
  vf.cells.none_is(str, cols=['like_counts']), # no cells must be instances of <str>
  vf.cells.none_gt(100000, cols=['like_counts']), # no cells must be greater than 100000
  vf.cells.none_lt(42, cols=['like_counts']), # no cells must be less than 42
  vf.cells.none_gte(100000, cols=['like_counts']), # no cells must be greater than or equal to 100000
  vf.cells.none_lte(42, cols=['like_counts']), # no cells must be less than or equal to 42   

  vf.cells.some_or_none_is(str, cols=['comment']), # some or no cells must be instances of <str>
  vf.cells.some_or_none_eq(0, cols=['like_counts']), # some or no cells must equal 0
  vf.cells.some_or_none_gt(0, cols=['like_counts']), # some or no cells must be greater than 0
  vf.cells.some_or_none_lt(0, cols=['like_counts']), # some or no cells must be less than 0
  vf.cells.some_or_none_gte(0, cols=['like_counts']), # some or no cells must be greater than or equal to 0
  vf.cells.some_or_none_lte(0, cols=['like_counts']), # some or no cells must be less than or equal to 0

  vf.cells.all_or_none_is(str, cols=['comment']), # all or no cells must be instances of <str>
  vf.cells.all_or_none_eq(42, cols=['like_counts']), # all or no cells must equal 42
  vf.cells.all_or_none_gt(100000, cols=['like_counts']), # all or no cells must be greater than 100000
  vf.cells.all_or_none_lt(987, cols=['like_counts']), # all or no cells must be less than 987
  vf.cells.all_or_none_gte(100000, cols=['like_counts']), # all or no cells must be greater than or equal to 100000
  vf.cells.all_or_none_lte(987, cols=['like_counts']), # all or no cells must be less than or equal to 987

  vf.cells.all_or_some_is(str, cols=['comment']), # all or some cells must be instances of <str>
  vf.cells.all_or_some_eq(0, cols=['like_counts']), # all or some cells must equal 0
  vf.cells.all_or_some_gt(100000, cols=['like_counts']), # all or some cells must be greater than 100000
  vf.cells.all_or_some_lt(42, cols=['like_counts']), # all or some cells must be less than 42
  vf.cells.all_or_some_gte(100000, cols=['like_counts']), # all or some cells must be greater than or equal to 100000
  vf.cells.all_or_some_lte(42, cols=['like_counts']), # all or some cells must be less than or equal to 42   

  vf.cells.sum_eq(-1, cols=['like_counts']), # all cells summed must equal -1
  vf.cells.sum_gt(0, cols=['like_counts']), # all cells summed must be greater than 0
  vf.cells.sum_lt(0, cols=['like_counts']), # all cells summed must be less than 0
  vf.cells.sum_gte(0, cols=['like_counts']), # all cells must be greater than or equal to 0
  vf.cells.sum_lte(0, cols=['like_counts']), # all cells must be less than or equal to 0

  vf.cells.uniq(cols=['comments']) # all cells must be unique
]

for v in validators:
  try:
    v.validate(df)
  except AssertionError as err
    print(err)

# AssertionError: frame must be empty
# AssertionError: frame must have 100 rows
# AssertionError: (cols=['like_counts']) all cells must equal 1
# AssertionError: (cols=['like_counts']) all cells must be less than 0
# AssertionError: (cols=['like_counts']) all cells must be less than or equal 0
# AssertionError: (cols=['comment']) some cells must be instances of <numpy.nan>
# AssertionError: (cols=['like_counts']) some cells must be greater than 100000
# AssertionError: (cols=['like_counts']) some cells must be less than 987
# AssertionError: (cols=['like_counts']) no cells must be greater than or equal to 100000
# AssertionError: (cols=['like_counts']) no cells must be less than or equal to 42
# AssertionError: (cols=['comment']) some or no cells must be instances of <str>
# AssertionError: (cols=['like_counts']) some or no cells must be greater than 0
# AssertionError: (cols=['like_counts']) some or no cells must be greater than or equal to 0
# AssertionError: (cols=['like_counts']) all or no cells must equal 42
# AssertionError: (cols=['like_counts']) all or no cells must be greater than or equal to 100000
# AssertionError: (cols=['like_counts']) all or no cells must be less than or equal to 987
# AssertionError: (cols=['like_counts']) all or some cells must equal 0
# AssertionError: (cols=['like_counts']) all or some cells must be greater than 100000
# AssertionError: (cols=['like_counts']) all or some cells must be less than 42
# AssertionError: (cols=['like_counts']) all cells summed must be less than 0 

Not quite exhaustive, but enough to cover basic use.

Think there are some other common validators that are missing here? Proposals via issues and PRs are welcomed 👍

More advanced usage

Custom validators

When none of the predefined validators can do the trick, well its time to roll up your sleeves and create your own validator.

For starters you can create a CellsValidator to validate dataframes by their cells:

import validframe as vf

df = pd.DataFrame(
  columns: ['like_counts','comment'], # headers
  data: [
    [42, 'hello world'], # row 0
    [100000, '😆'], # row 1
    [123456, 'lol'], # row 2
    [987, 'earth is definitely flat'] # row 3
  ])

alotta_likes_validator = vf.CellsValidator(
  lambda xs: all([x >= 1000 for x in xs]),
  'all like counts must be atleast 1000'
  cols=['like_counts']
)

alotta_likes_validator.validate(df) # AssertionError: all likes must be atleast 1000

You can also create a RowsValidator to validate dataframes by their rows:

df = pd.DataFrame(
  columns: ['date', 'total', 'subtotal', 'tax'], # headers\
  data: [
    ['2020-01-11', 108.25, 100, 8.25], 
    ['2010-01-11', 106, 100, 6], 
    ['2009-01-11', 104.50, 100, 4.50] 
  ])

total_validator = vf.RowsValidator(
  lambda rows: all([row['total'] == row['sub_total'] + row['tax'] for row in rows]),
  'all rows must have total equal the sub-total plus tax',
  cols=['total', 'sub_total', 'tax']
)

total_validator.validate(df) # pass

If you really enjoy pandas then you might prefer to create a FrameValidator to validate dataframes utilizing pandas and numpy to write the logic:

import pandas as pd
import numpy as np

ledger_df = pd.DataFrame(
  columns = ['company', 'balance'],
  data = [
    ['Google', 100000], 
    ['Google', -90000], 
    ['Netflix', -10000], # will be unbalanced
    ['Amazon', 0], 
    ['Google', -10000], 
  ]
)

def is_balanced_by_company(df):
  pivot_df = df.pivot_table(values='balance', columns=['company'], aggfunc=np.sum)
  return pivot_df[pivot_df == 0].count().sum() == 0

balanced_validator = vf.FrameValidator(
  is_balanced_by_company,
  'sum of balances for every company must equals 0'
)

balanced_validator.validate(ledger_df) # AssertionError: sum of balances for every company must equals 0

Go functional

As with validium validators in general, using a functional programming library like ramda can add brevity and readability to the code for your validation logic.

import ramda as R

# same as above
all_gt_zero_validator = vf.CellsValidator(
  R.all(lambda x: x>0),
  'all cells must be greater than 0'
  cols=['a']
)

This is especially true when your validation logic start to become a bit more complex:

sum_numbers_eq_zero_validator = vf.CellsValidator(
  R.compose(R.equals(0), R.sum, R.filter(lambda x: isinstance(x, Number)),
  'all cells that are numbers summed must be greater than 0'
  cols=['credit', 'debit']
)

Max flexibility

Another recommendation would be to use a function instead of a lambda when your validation logic can't be expressed comfortably as a onliner, eg. your logic involves making a request to a web API:

import pandas as pd
import request

def match_remote_checksums(df):
  checksums = request.get(REMOTE_CHECKSUM_URL) # just imagine
  remote_df = pd.DataFrame({'checksum': checksums})
  return df.equals(remote_df)

# as a oneliner:
# match_remote_checksums = lambda df: pd.DataFrame({'checksum': request.get(REMOTE_CHECKSUM_URL)}).equals(df)

validator = vf.FrameValidator(
  match_remote_checksums, 
  'checksums must match the set from the server', 
  cols=['checksum']
)