From 6b92016476f92e8c29259ba0bd2d397a53e7125d Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Thu, 17 May 2018 09:39:42 -0500 Subject: [PATCH] BUG Don't calculate levels if we aren't expanding (#39) If a user has excluded a column because they know that column has too many levels, they shouldn't then get a warning about that column. Make sure we don't count the levels in columns which are going to be dropped anyway. This fix also makes the `_cols_to_expand` attribute make more sense -- it no longer contains columns which we aren't going to expand. --- CHANGELOG.md | 6 +++++- civismlext/preprocessing.py | 2 ++ civismlext/test/test_preprocessing.py | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59f1521..5dbc25e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). ## Unreleased - +### Fixed +- In ``DataFrameETL``, don't check for levels to expand in columns which + are slated to be dropped. This will avoid raising a warning for too + many levels in a column if the user has intentionally excluded + that column (#39). ## [0.1.8] - 2018-04-19 ### Fixed diff --git a/civismlext/preprocessing.py b/civismlext/preprocessing.py index 24476dc..fa2e8ef 100644 --- a/civismlext/preprocessing.py +++ b/civismlext/preprocessing.py @@ -241,6 +241,8 @@ def fit(self, X, y=None): else: self._cols_to_expand = [c for c in self.cols_to_expand if c in X.columns] + self._cols_to_expand = [c for c in self._cols_to_expand if + c not in self._cols_to_drop] log.debug("There are %d column(s) to expand.", len(self._cols_to_expand)) # Update sentinels if the defaults are in the dataframe diff --git a/civismlext/test/test_preprocessing.py b/civismlext/test/test_preprocessing.py index 15c2fef..3783e1f 100644 --- a/civismlext/test/test_preprocessing.py +++ b/civismlext/test/test_preprocessing.py @@ -291,6 +291,17 @@ def test_create_col_names_numeric(data_raw): assert unexpanded == ['pid', 'fruits', 'age'] +def test_dropped_cols_no_levels(data_raw): + # If the user requests that we drop a column, we shouldn't create + # levels for it. That risks raising a warning for too many levels + # when it doesn't matter. + expander = DataFrameETL(cols_to_drop=['pid']) + expander.fit(data_raw) + + assert 'animal' in expander.levels_ + assert 'pid' not in expander.levels_ + + def test_expand_col(data_raw): expander = DataFrameETL(cols_to_drop=['fruits'], dummy_na=True,