{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Why do method chaining? Because data cleaning is essentially a graph. \n",
    "\n",
    "Instead of jumping back and forth, it is easier if all cleaning of one dataset happens in one place. However, due to the lack of easy-to-use custom methods, it is cumbersome."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Basic cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from janitor import then"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from functools import partial"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_avocados = pd.read_csv('avocado-prices.zip', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>AveragePrice</th>\n",
       "      <th>Total Volume</th>\n",
       "      <th>4046</th>\n",
       "      <th>4225</th>\n",
       "      <th>4770</th>\n",
       "      <th>Total Bags</th>\n",
       "      <th>Small Bags</th>\n",
       "      <th>Large Bags</th>\n",
       "      <th>XLarge Bags</th>\n",
       "      <th>type</th>\n",
       "      <th>year</th>\n",
       "      <th>region</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>2017-04-23</td>\n",
       "      <td>2.08</td>\n",
       "      <td>3223.81</td>\n",
       "      <td>0.00</td>\n",
       "      <td>109.87</td>\n",
       "      <td>0.00</td>\n",
       "      <td>3113.94</td>\n",
       "      <td>958.29</td>\n",
       "      <td>2155.65</td>\n",
       "      <td>0.0</td>\n",
       "      <td>organic</td>\n",
       "      <td>2017</td>\n",
       "      <td>Syracuse</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>2017-03-12</td>\n",
       "      <td>1.19</td>\n",
       "      <td>34526.66</td>\n",
       "      <td>3661.47</td>\n",
       "      <td>3665.38</td>\n",
       "      <td>4.91</td>\n",
       "      <td>27194.90</td>\n",
       "      <td>162.44</td>\n",
       "      <td>27032.46</td>\n",
       "      <td>0.0</td>\n",
       "      <td>organic</td>\n",
       "      <td>2017</td>\n",
       "      <td>Portland</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>2015-06-28</td>\n",
       "      <td>1.16</td>\n",
       "      <td>135739.94</td>\n",
       "      <td>78066.95</td>\n",
       "      <td>19522.69</td>\n",
       "      <td>866.73</td>\n",
       "      <td>37283.57</td>\n",
       "      <td>29903.31</td>\n",
       "      <td>7380.26</td>\n",
       "      <td>0.0</td>\n",
       "      <td>conventional</td>\n",
       "      <td>2015</td>\n",
       "      <td>Jacksonville</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>2017-04-09</td>\n",
       "      <td>1.24</td>\n",
       "      <td>26121.58</td>\n",
       "      <td>11723.26</td>\n",
       "      <td>530.28</td>\n",
       "      <td>0.00</td>\n",
       "      <td>13868.04</td>\n",
       "      <td>13741.99</td>\n",
       "      <td>126.05</td>\n",
       "      <td>0.0</td>\n",
       "      <td>organic</td>\n",
       "      <td>2017</td>\n",
       "      <td>DallasFtWorth</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>2017-09-03</td>\n",
       "      <td>1.50</td>\n",
       "      <td>7148.98</td>\n",
       "      <td>222.83</td>\n",
       "      <td>202.30</td>\n",
       "      <td>0.00</td>\n",
       "      <td>6723.85</td>\n",
       "      <td>6707.19</td>\n",
       "      <td>16.66</td>\n",
       "      <td>0.0</td>\n",
       "      <td>organic</td>\n",
       "      <td>2017</td>\n",
       "      <td>NewOrleansMobile</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Date  AveragePrice  Total Volume      4046      4225    4770  \\\n",
       "36  2017-04-23          2.08       3223.81      0.00    109.87    0.00   \n",
       "42  2017-03-12          1.19      34526.66   3661.47   3665.38    4.91   \n",
       "26  2015-06-28          1.16     135739.94  78066.95  19522.69  866.73   \n",
       "38  2017-04-09          1.24      26121.58  11723.26    530.28    0.00   \n",
       "17  2017-09-03          1.50       7148.98    222.83    202.30    0.00   \n",
       "\n",
       "    Total Bags  Small Bags  Large Bags  XLarge Bags          type  year  \\\n",
       "36     3113.94      958.29     2155.65          0.0       organic  2017   \n",
       "42    27194.90      162.44    27032.46          0.0       organic  2017   \n",
       "26    37283.57    29903.31     7380.26          0.0  conventional  2015   \n",
       "38    13868.04    13741.99      126.05          0.0       organic  2017   \n",
       "17     6723.85     6707.19       16.66          0.0       organic  2017   \n",
       "\n",
       "              region  \n",
       "36          Syracuse  \n",
       "42          Portland  \n",
       "26      Jacksonville  \n",
       "38     DallasFtWorth  \n",
       "17  NewOrleansMobile  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_avocados.sample(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using `janitor.then()`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_yearly_sum_by_PID(df, PID):\n",
    "    output = (\n",
    "        df\n",
    "        [['year', str(PID)]]\n",
    "        .groupby(['year'], as_index=False)\n",
    "        .agg({\n",
    "            str(PID): 'sum'\n",
    "        })\n",
    "        .sort_values('year')\n",
    "    )\n",
    "    return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from janitor import then\n",
    "from functools import partial\n",
    "\n",
    "df_by_pid = (\n",
    "    raw_avocados\n",
    "    .then(partial(get_yearly_sum_by_PID, PID=4770))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>year</th>\n",
       "      <th>4770</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2015</td>\n",
       "      <td>1.427724e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2016</td>\n",
       "      <td>1.598798e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2017</td>\n",
       "      <td>9.121751e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2018</td>\n",
       "      <td>2.293259e+07</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   year          4770\n",
       "0  2015  1.427724e+08\n",
       "1  2016  1.598798e+08\n",
       "2  2017  9.121751e+07\n",
       "3  2018  2.293259e+07"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_by_pid"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comparison with normal pandas\n",
    "\n",
    "What I would do if there is no pyjanitor?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[janitor.remove_columns](https://pyjanitor.readthedocs.io/reference/janitor.functions/janitor.remove_columns.html#janitor.remove_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drop_cols = ['Small Bags', 'Large Bags', 'XLarge Bags']\n",
    "# pandas style\n",
    "df_no_bags_pd = raw_avocados.drop(drop_cols, axis=1)\n",
    "# pyjanitor style\n",
    "df_no_bags = raw_avocados.remove_columns(drop_cols)\n",
    "df_no_bags.equals(df_no_bags_pd)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[janitor.to_datetime](https://pyjanitor.readthedocs.io/reference/janitor.functions/janitor.to_datetime.html#janitor.to_datetime)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pandas style\n",
    "df_dt = raw_avocados.assign(Date=lambda _df: pd.to_datetime(_df['Date']))\n",
    "# pyjanitor style\n",
    "df_dt2 = raw_avocados.to_datetime('Date')\n",
    "df_dt.equals(df_dt2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Date    datetime64[ns]\n",
       "dtype: object"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_dt[['Date']].dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[janitor.dropnotnull](https://pyjanitor.readthedocs.io/reference/janitor.functions/janitor.dropnotnull.html#janitor.dropnotnull)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "nan = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = pd.DataFrame({\n",
    "    'a': [1, nan, 3],\n",
    "    'b': ['x', 'y', 'z']\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>x</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>z</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     a  b\n",
       "0  1.0  x\n",
       "1  NaN  y\n",
       "2  3.0  z"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_out1 = test_df.dropnotnull('a')\n",
    "test_out2 = test_df[lambda _df: pd.isnull(_df['a'])]\n",
    "test_out1.equals(test_out2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Custom chaining function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas_flavor as pf"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "@pf.register_dataframe_method\n",
    "def then(df: pd.DataFrame, func, *args, **kwargs) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Add an arbitrary function to run in the ``pyjanitor`` method chain.\n",
    "\n",
    "    This method does not mutate the original DataFrame.\n",
    "\n",
    "    :param df: A pandas dataframe.\n",
    "    :param func: A function you would like to run in the method chain.\n",
    "        It should take one parameter and return one parameter, each being the\n",
    "        DataFrame object. After that, do whatever you want in the middle.\n",
    "        Go crazy.\n",
    "    :returns: A pandas DataFrame.\n",
    "    \"\"\"\n",
    "    df = func(df, *args, **kwargs)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use with other package e.g. great_expectations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import great_expectations as ge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "ge_avocados = ge.read_csv('avocado.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>year</th>\n",
       "      <th>4046</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2015</td>\n",
       "      <td>1.709450e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2016</td>\n",
       "      <td>1.525123e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2017</td>\n",
       "      <td>1.652038e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2018</td>\n",
       "      <td>4.604997e+08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   year          4046\n",
       "0  2015  1.709450e+09\n",
       "1  2016  1.525123e+09\n",
       "2  2017  1.652038e+09\n",
       "3  2018  4.604997e+08"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ge = (\n",
    "    ge_avocados\n",
    "    .then(partial(get_yearly_sum_by_PID, PID=4046))\n",
    ")\n",
    "df_ge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.frame.DataFrame"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(df_ge)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}