diff --git a/.gitignore b/.gitignore
index 1d3379d..2e9d7ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -83,3 +83,7 @@ doc/cdoc/build
 ehthumbs.db
 Icon?
 Thumbs.db
+
+# Test generated files #
+########################
+.python-version
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..1c4cac6
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,23 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v5.0.0
+  hooks:
+  - id: check-yaml
+  - id: end-of-file-fixer
+  - id: trailing-whitespace
+  - id: fix-byte-order-marker
+  - id: destroyed-symlinks
+  - id: fix-encoding-pragma
+    args: ["--remove"]
+  - id: mixed-line-ending
+  - id: name-tests-test
+    args: ["--pytest-test-first"]
+  - id: pretty-format-json
+    args: ["--autofix", "--no-ensure-ascii"]
+    exclude: ".ipynb"
+
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.7.3
+  hooks:
+  - id: ruff-format
+    types_or: [ python, pyi, jupyter ]
diff --git a/TODO b/TODO
index 60a701e..2b18b11 100644
--- a/TODO
+++ b/TODO
@@ -104,7 +104,7 @@ the cheap trick way of doing it is:
   def arima(n, m):
       return ArimaModelType(n, m)
 and then in the factor type sniffing code detect these things and
-separate them out from "real" factors. 
+separate them out from "real" factors.
 
 * make sure that pickling works
   - And make sure that if we allow it at all, then it's sustainable!
diff --git a/doc/R-comparison.rst b/doc/R-comparison.rst
index 8e74f49..fb5905c 100644
--- a/doc/R-comparison.rst
+++ b/doc/R-comparison.rst
@@ -105,7 +105,7 @@ Differences from R:
      # R:
      > qr(model.matrix(~ 1 + a:b))$rank
      [1] 4
-  
+
   However, the matrix produced for this formula has 5 columns, meaning
   that it contains redundant overspecification:
 
@@ -149,7 +149,7 @@ Differences from R:
   use a full-rank encoding for ``b``. Therefore, we *should* use a
   full-rank encoding for ``b``, and produce a model matrix with 6
   columns. But in fact, R gives us only 4:
-  
+
   .. code-block:: rconsole
 
      # R:
diff --git a/doc/_examples/example_lm.py b/doc/_examples/example_lm.py
index eb56afc..4f85a35 100644
--- a/doc/_examples/example_lm.py
+++ b/doc/_examples/example_lm.py
@@ -1,9 +1,11 @@
 import numpy as np
 from patsy import dmatrices, build_design_matrices
 
+
 class LM(object):
     """An example ordinary least squares linear model class, analogous to R's
     lm() function.
     Don't use this in real life, it isn't properly tested."""
+
     def __init__(self, formula_like, data={}):
         y, x = dmatrices(formula_like, data, 1)
         self.nobs = x.shape[0]
@@ -12,27 +14,27 @@ def __init__(self, formula_like, data={}):
         self._x_design_info = x.design_info
 
     def __repr__(self):
-        summary = ("Ordinary least-squares regression\n"
-                   "  Model: %s ~ %s\n"
-                   "  Regression (beta) coefficients:\n"
-                   % (self._y_design_info.describe(),
-                      self._x_design_info.describe()))
+        summary = (
+            "Ordinary least-squares regression\n"
+            "  Model: %s ~ %s\n"
+            "  Regression (beta) coefficients:\n"
+            % (self._y_design_info.describe(), self._x_design_info.describe())
+        )
         for name, value in zip(self._x_design_info.column_names, self.betas):
             summary += "  %s: %0.3g\n" % (name, value[0])
         return summary
 
     def predict(self, new_data):
-        (new_x,) = build_design_matrices([self._x_design_info],
-                                         new_data)
+        (new_x,) = build_design_matrices([self._x_design_info], new_data)
         return np.dot(new_x, self.betas)
 
     def loglik(self, new_data):
-        (new_y, new_x) = build_design_matrices([self._y_design_info,
-                                                self._x_design_info],
-                                               new_data)
+        (new_y, new_x) = build_design_matrices(
+            [self._y_design_info, self._x_design_info], new_data
+        )
         new_pred = np.dot(new_x, self.betas)
         sigma2 = self.rss / self.nobs
         # It'd be more elegant to use scipy.stats.norm.logpdf here, but adding
         # a dependency on scipy makes the docs build more complicated:
         Z = -0.5 * np.log(2 * np.pi * sigma2)
-        return Z + -0.5 * (new_y - new_x) ** 2/sigma2
+        return Z + -0.5 * (new_y - new_pred) ** 2 / sigma2
diff --git a/doc/_examples/example_treatment.py b/doc/_examples/example_treatment.py
index ddc88d5..387c4d9 100644
--- a/doc/_examples/example_treatment.py
+++ b/doc/_examples/example_treatment.py
@@ -1,18 +1,26 @@
 import numpy as np
 
+
 class MyTreat(object):
     def __init__(self, reference=0):
         self.reference = reference
 
     def code_with_intercept(self, levels):
-        return ContrastMatrix(np.eye(len(levels)),
-                              ["[My.%s]" % (level,) for level in levels])
+        return ContrastMatrix(
+            np.eye(len(levels)), ["[My.%s]" % (level,) for level in levels]
+        )
 
     def code_without_intercept(self, levels):
         eye = np.eye(len(levels) - 1)
-        contrasts = np.vstack((eye[:self.reference, :],
-                               np.zeros((1, len(levels) - 1)),
-                               eye[self.reference:, :]))
-        suffixes = ["[MyT.%s]" % (level,) for level in
-                    levels[:self.reference] + levels[self.reference + 1:]]
+        contrasts = np.vstack(
+            (
+                eye[: self.reference, :],
+                np.zeros((1, len(levels) - 1)),
+                eye[self.reference :, :],
+            )
+        )
+        suffixes = [
+            "[MyT.%s]" % (level,)
+            for level in levels[: self.reference] + levels[self.reference + 1 :]
+        ]
         return ContrastMatrix(contrasts, suffixes)
diff --git a/doc/_static/facebox.css b/doc/_static/facebox.css
index 3f33b9f..4cacbac 100644
--- a/doc/_static/facebox.css
+++ b/doc/_static/facebox.css
@@ -77,4 +77,4 @@
 .facebox_overlayBG {
   background-color: #000;
   z-index: 99;
-}
\ No newline at end of file
+}
diff --git a/doc/_static/show-code.js b/doc/_static/show-code.js
index fbff113..7bd102c 100644
--- a/doc/_static/show-code.js
+++ b/doc/_static/show-code.js
@@ -25,13 +25,13 @@ function scrapeText(codebox){
     return newlines.join('\\n');
 }
 
-$(document).ready( 
+$(document).ready(
     function() {
         // grab all code boxes
         var ipythoncode = $(".highlight-ipython");
         $.each(ipythoncode, function() {
            var code = scrapeText($(this).text());
-           // give them a facebox pop-up with plain text code 
+           // give them a facebox pop-up with plain text code
            $(this).append('View Code');
            $(this,"textarea").select();
         });
diff --git a/doc/categorical-coding.rst b/doc/categorical-coding.rst
index f470616..8c0a6a4 100644
--- a/doc/categorical-coding.rst
+++ b/doc/categorical-coding.rst
@@ -78,7 +78,7 @@ As an example, here's a simplified version of the built-in
 :class:`Treatment` coding object:
 
 .. literalinclude:: _examples/example_treatment.py
-   
+
 .. ipython:: python
    :suppress:
 
diff --git a/doc/conf.py b/doc/conf.py
index c9fffff..2ad5d64 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,24 +1,26 @@
-# -*- coding: utf-8 -*-
-
 # General information about the project.
-project = 'patsy'
-copyright = '2011-2015, Nathaniel J. Smith'
+project = "patsy"
+copyright = "2011-2015, Nathaniel J. Smith"
 
 import sys
+
 print("python exec:", sys.executable)
 print("sys.path:", sys.path)
 try:
     import numpy
+
     print("numpy: %s, %s" % (numpy.__version__, numpy.__file__))
 except ImportError:
     print("no numpy")
 try:
     import matplotlib
+
     print("matplotlib: %s, %s" % (matplotlib.__version__, matplotlib.__file__))
 except ImportError:
     print("no matplotlib")
 try:
     import IPython
+
     print("ipython: %s, %s" % (IPython.__version__, IPython.__file__))
 except ImportError:
     print("no ipython")
@@ -29,8 +31,10 @@
 #
 # The short X.Y version.
 import sys, os
+
 sys.path.insert(0, os.getcwd() + "/..")
 import patsy
+
 version = patsy.__version__
 # The full version, including alpha/beta/rc tags.
 release = version
@@ -52,17 +56,21 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.append(os.path.abspath('sphinxext'))
+sys.path.append(os.path.abspath("sphinxext"))
 
 # -- General configuration -----------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.imgmath',
-              'sphinx.ext.intersphinx',
-              'IPython.sphinxext.ipython_directive',
-              'IPython.sphinxext.ipython_console_highlighting',
-              ]
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.doctest",
+    "sphinx.ext.imgmath",
+    "sphinx.ext.intersphinx",
+    "IPython.sphinxext.ipython_directive",
+    "IPython.sphinxext.ipython_console_highlighting",
+]
+
 # Undocumented trick: if we def setup here in conf.py, it gets called just
 # like an extension's setup function.
@@ -71,171 +79,170 @@ def setup(app):
     app.add_javascript("facebox.js")
     app.add_stylesheet("facebox.css")
 
+
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The encoding of source files.
-#source_encoding = 'utf-8'
+# source_encoding = 'utf-8'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
-#language = None
+# language = None
 
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
-#today = ''
+# today = ''
 
 # Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
 
 # List of documents that shouldn't be included in the build.
-#unused_docs = []
+# unused_docs = []
 
 # List of directories, relative to source directory, that shouldn't be searched
 # for source files.
-exclude_trees = ['_build']
+exclude_trees = ["_build"]
 
 # The reST default role (used for this markup: `text`) to use for all documents.
-#default_role = None
+# default_role = None
 
 # If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
 
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
 
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
-#show_authors = False
+# show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
 
 # -- Options for HTML output ---------------------------------------------------
 
 # The theme to use for HTML and HTML Help pages. Major themes that come with
 # Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = 'default'
+html_theme = "default"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further. For a list of options available for each theme, see the
 # documentation.
-#html_theme_options = {}
+# html_theme_options = {}
 
 # Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
 
 # The name for this set of Sphinx documents. If None, it defaults to
 # "<project> v<release> documentation".
-#html_title = None
+# html_title = None
 
 # A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
 
 # The name of an image file (relative to this directory) to place at the top
 # of the sidebar.
-#html_logo = None
+# html_logo = None
 
 # The name of an image file (within the static path) to use as favicon of the
 # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-#html_favicon = None
+# html_favicon = None
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
 
 # If false, no module index is generated.
-#html_use_modindex = True
+# html_use_modindex = True
 
 # If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
 
 # If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
 
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
 
 # If true, an OpenSearch description file will be output, and all pages will
 # contain a <link> tag referring to it. The value of this option must be the
 # base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
 
 # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = ''
+# html_file_suffix = ''
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'patsydoc'
+htmlhelp_basename = "patsydoc"
 
 
 # -- Options for LaTeX output --------------------------------------------------
 
 # The paper size ('letter' or 'a4').
-#latex_paper_size = 'letter'
+# latex_paper_size = 'letter'
 
 # The font size ('10pt', '11pt' or '12pt').
-#latex_font_size = '10pt'
+# latex_font_size = '10pt'
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title, author, documentclass [howto/manual]).
 latex_documents = [
-    ('index', 'patsy.tex', u'patsy Documentation',
-     u'Nathaniel J. Smith', 'manual'),
+    ("index", "patsy.tex", "patsy Documentation", "Nathaniel J. Smith", "manual"),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
 # the title page.
-#latex_logo = None
+# latex_logo = None
 
 # For "manual" documents, if this is true, then toplevel headings are parts,
 # not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
 
 # Additional stuff for the LaTeX preamble.
-#latex_preamble = ''
+# latex_preamble = ''
 
 # Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
 
 # If false, no module index is generated.
-#latex_use_modindex = True
+# latex_use_modindex = True
 
 # -- Custom extra options
 
 autoclass_content = "both"
-intersphinx_mapping = {"python": ("http://docs.python.org", None),
-                       "numpy": ("http://docs.scipy.org/doc/numpy",
-                                 None),
-                       "pandas": ('http://pandas.pydata.org/pandas-docs/stable/',
-                                  None),
-                       }
+intersphinx_mapping = {
+    "python": ("http://docs.python.org", None),
+    "numpy": ("http://docs.scipy.org/doc/numpy", None),
+    "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None),
+}
 autodoc_member_order = "source"
diff --git a/doc/expert-model-specification.rst b/doc/expert-model-specification.rst
index 16c5779..4595bac 100644
--- a/doc/expert-model-specification.rst
+++ b/doc/expert-model-specification.rst
@@ -251,7 +251,7 @@ Put together, it looks something like this:
 .. code-block:: python
 
     class MyAlternativeFactor(object):
-        # A factor object that simply returns the design 
+        # A factor object that simply returns the design
         def __init__(self, alternative_formula, side):
             self.alternative_formula = alternative_formula
             self.side = side
diff --git a/doc/formulas.rst b/doc/formulas.rst
index 421c901..e644dc1 100644
--- a/doc/formulas.rst
+++ b/doc/formulas.rst
@@ -16,7 +16,7 @@ and interpreted. Here's the picture you'll want to keep in mind:
 
 .. figure:: figures/formula-structure.png
    :align: center
-   
+
    The pieces that make up a formula.
 
 Say we have a formula like::
@@ -493,7 +493,7 @@ Then:
 more fundamental idea, that when we write:
 
     y ~ a:b
-    
+
 we mean that the value of `y` can vary depending on every possible
 *combination* of `a` and `b`.
diff --git a/doc/library-developers.rst b/doc/library-developers.rst
index 478d302..1bf282c 100644
--- a/doc/library-developers.rst
+++ b/doc/library-developers.rst
@@ -128,9 +128,9 @@ And here's how it can be used:
     # Old and boring approach (but it still works):
     X = np.column_stack(([1] * len(data["y"]), data["x"]))
     LM((data["y"], X))
-    
+
     # Fancy new way:
-    m = LM("y ~ x", data) 
+    m = LM("y ~ x", data)
     m
     m.predict({"x": [10, 20, 30]})
     m.loglik(data)
diff --git a/patsy/__init__.py b/patsy/__init__.py
index 1617052..50431ec 100644
--- a/patsy/__init__.py
+++ b/patsy/__init__.py
@@ -10,15 +10,23 @@
 # Do this first, to make it easy to check for warnings while testing:
 import os
+
 if os.environ.get("PATSY_FORCE_NO_WARNINGS"):
     import warnings
+
     warnings.filterwarnings("error", module="^patsy")
-    warnings.filterwarnings("ignore", "is_categorical_dtype is deprecated", DeprecationWarning, module="^patsy")
+    warnings.filterwarnings(
+        "ignore",
+        "is_categorical_dtype is deprecated",
+        DeprecationWarning,
+        module="^patsy",
+    )
     del warnings
 del os
 
 import patsy.origin
 
+
 class PatsyError(Exception):
     """This is the main error type raised by Patsy functions.
 
@@ -35,6 +43,7 @@ class PatsyError(Exception):
     ``.message`` and ``.origin`` attributes directly. (The latter may be
     None.)
     """
+
     def __init__(self, message, origin=None):
         Exception.__init__(self, message)
         self.message = message
@@ -45,8 +54,7 @@ def __str__(self):
         if self.origin is None:
             return self.message
         else:
-            return ("%s\n%s"
-                    % (self.message, self.origin.caretize(indent=4)))
+            return "%s\n%s" % (self.message, self.origin.caretize(indent=4))
 
     def set_origin(self, origin):
         # This is useful to modify an exception to add origin information as
@@ -60,56 +68,72 @@ def set_origin(self, origin):
             origin = None
         self.origin = origin
 
+
 __all__ = ["PatsyError"]
 
 # We make a rich API available for explicit use. To see what exactly is
 # exported, check each module's __all__, or import this module and look at its
 # __all__.
 
+
 def _reexport(mod):
     __all__.extend(mod.__all__)
     for var in mod.__all__:
         globals()[var] = getattr(mod, var)
 
+
 # This used to have less copy-paste, but explicit import statements make
 # packaging tools like py2exe and py2app happier. Sigh.
 import patsy.highlevel
+
 _reexport(patsy.highlevel)
 
 import patsy.build
+
 _reexport(patsy.build)
 
 import patsy.constraint
+
 _reexport(patsy.constraint)
 
 import patsy.contrasts
+
 _reexport(patsy.contrasts)
 
 import patsy.desc
+
 _reexport(patsy.desc)
 
 import patsy.design_info
+
 _reexport(patsy.design_info)
 
 import patsy.eval
+
 _reexport(patsy.eval)
 
 import patsy.origin
+
 _reexport(patsy.origin)
 
 import patsy.state
+
 _reexport(patsy.state)
 
 import patsy.user_util
+
 _reexport(patsy.user_util)
 
 import patsy.missing
+
 _reexport(patsy.missing)
 
 import patsy.splines
+
 _reexport(patsy.splines)
 
 import patsy.mgcv_cubic_splines
+
 _reexport(patsy.mgcv_cubic_splines)
 
 # XX FIXME: we aren't exporting any of the explicit parsing interface
diff --git a/patsy/build.py b/patsy/build.py
index 6f9067e..b6d6475 100644
--- a/patsy/build.py
+++ b/patsy/build.py
@@ -11,14 +11,14 @@
 import numpy as np
 
 from patsy import PatsyError
-from patsy.categorical import (guess_categorical,
-                               CategoricalSniffer,
-                               categorical_to_int)
-from patsy.util import (atleast_2d_column_default,
-                        have_pandas, asarray_or_pandas,
-                        safe_issubdtype)
-from patsy.design_info import (DesignMatrix, DesignInfo,
-                               FactorInfo, SubtermInfo)
+from patsy.categorical import guess_categorical, CategoricalSniffer, categorical_to_int
+from patsy.util import (
+    atleast_2d_column_default,
+    have_pandas,
+    asarray_or_pandas,
+    safe_issubdtype,
+)
+from patsy.design_info import DesignMatrix, DesignInfo, FactorInfo, SubtermInfo
 from patsy.redundancy import pick_contrasts_for_term
 from patsy.eval import EvalEnvironment
 from patsy.contrasts import code_contrast_matrix, Treatment
@@ -28,6 +28,7 @@
 if have_pandas:
     import pandas
 
+
 class _MockFactor(object):
     def __init__(self, name="MOCKMOCK"):
         self._name = name
@@ -38,15 +39,19 @@ def eval(self, state, env):
     def name(self):
         return self._name
 
+
 def _max_allowed_dim(dim, arr, factor):
     if arr.ndim > dim:
-        msg = ("factor '%s' evaluates to an %s-dimensional array; I only "
-               "handle arrays with dimension <= %s"
-               % (factor.name(), arr.ndim, dim))
+        msg = (
+            "factor '%s' evaluates to an %s-dimensional array; I only "
+            "handle arrays with dimension <= %s" % (factor.name(), arr.ndim, dim)
+        )
         raise PatsyError(msg, factor)
 
+
 def test__max_allowed_dim():
     import pytest
+
     f = _MockFactor()
     _max_allowed_dim(1, np.array(1), f)
     _max_allowed_dim(1, np.array([1]), f)
@@ -57,6 +62,7 @@ def test__max_allowed_dim():
     _max_allowed_dim(2, np.array([[1]]), f)
     pytest.raises(PatsyError, _max_allowed_dim, 2, np.array([[[1]]]), f)
 
+
 def _eval_factor(factor_info, data, NA_action):
     factor = factor_info.factor
     result = factor.eval(factor_info.state, data)
@@ -65,28 +71,32 @@ def _eval_factor(factor_info, data, NA_action):
         result = atleast_2d_column_default(result, preserve_pandas=True)
         _max_allowed_dim(2, result, factor)
         if result.shape[1] != factor_info.num_columns:
-            raise PatsyError("when evaluating factor %s, I got %s columns "
-                             "instead of the %s I was expecting"
-                             % (factor.name(),
-                                factor_info.num_columns,
-                                result.shape[1]),
-                             factor)
+            raise PatsyError(
+                "when evaluating factor %s, I got %s columns "
+                "instead of the %s I was expecting"
+                % (factor.name(), factor_info.num_columns, result.shape[1]),
+                factor,
+            )
         if not safe_issubdtype(np.asarray(result).dtype, np.number):
-            raise PatsyError("when evaluating numeric factor %s, "
-                             "I got non-numeric data of type '%s'"
-                             % (factor.name(), result.dtype),
-                             factor)
+            raise PatsyError(
+                "when evaluating numeric factor %s, "
+                "I got non-numeric data of type '%s'" % (factor.name(), result.dtype),
+                factor,
+            )
         return result, NA_action.is_numerical_NA(result)
     # returns either a 1d ndarray or a pandas.Series, plus is_NA mask
     else:
         assert factor_info.type == "categorical"
-        result = categorical_to_int(result, factor_info.categories, NA_action,
-                                    origin=factor_info.factor)
+        result = categorical_to_int(
+            result, factor_info.categories, NA_action, origin=factor_info.factor
+        )
         assert result.ndim == 1
         return result, np.asarray(result == -1)
 
+
 def test__eval_factor_numerical():
     import pytest
+
     naa = NAAction()
     f = _MockFactor()
@@ -102,11 +112,8 @@ def test__eval_factor_numerical():
     pytest.raises(PatsyError, _eval_factor, fi1, {"mock": [[1, 2]]}, naa)
     pytest.raises(PatsyError, _eval_factor, fi1, {"mock": ["a", "b"]}, naa)
     pytest.raises(PatsyError, _eval_factor, fi1, {"mock": [True, False]}, naa)
-    fi2 = FactorInfo(_MockFactor(), "numerical",
-                     {}, num_columns=2, categories=None)
-    eval123321, is_NA = _eval_factor(fi2,
-                                     {"mock": [[1, 3], [2, 2], [3, 1]]},
-                                     naa)
+    fi2 = FactorInfo(_MockFactor(), "numerical", {}, num_columns=2, categories=None)
+    eval123321, is_NA = _eval_factor(fi2, {"mock": [[1, 3], [2, 2], [3, 1]]}, naa)
     assert eval123321.shape == (3, 2)
     assert np.all(eval123321 == [[1, 3], [2, 2], [3, 1]])
     assert is_NA.shape == (3,)
@@ -114,79 +121,84 @@ def test__eval_factor_numerical():
     pytest.raises(PatsyError, _eval_factor, fi2, {"mock": [1, 2, 3]}, naa)
     pytest.raises(PatsyError, _eval_factor, fi2, {"mock": [[1, 2, 3]]}, naa)
 
-    ev_nan, is_NA = _eval_factor(fi1, {"mock": [1, 2, np.nan]},
-                                 NAAction(NA_types=["NaN"]))
+    ev_nan, is_NA = _eval_factor(
+        fi1, {"mock": [1, 2, np.nan]}, NAAction(NA_types=["NaN"])
+    )
     assert np.array_equal(is_NA, [False, False, True])
-    ev_nan, is_NA = _eval_factor(fi1, {"mock": [1, 2, np.nan]},
-                                 NAAction(NA_types=[]))
+    ev_nan, is_NA = _eval_factor(fi1, {"mock": [1, 2, np.nan]}, NAAction(NA_types=[]))
    assert np.array_equal(is_NA, [False, False, False])
 
     if have_pandas:
-        eval_ser, _ = _eval_factor(fi1,
-                                   {"mock":
-                                    pandas.Series([1, 2, 3],
-                                                  index=[10, 20, 30])},
-                                   naa)
+        eval_ser, _ = _eval_factor(
+            fi1, {"mock": pandas.Series([1, 2, 3], index=[10, 20, 30])}, naa
+        )
         assert isinstance(eval_ser, pandas.DataFrame)
         assert np.array_equal(eval_ser, [[1], [2], [3]])
         assert np.array_equal(eval_ser.index, [10, 20, 30])
-        eval_df1, _ = _eval_factor(fi1,
-                                   {"mock":
-                                    pandas.DataFrame([[2], [1], [3]],
-                                                     index=[20, 10, 30])},
-                                   naa)
+        eval_df1, _ = _eval_factor(
+            fi1, {"mock": pandas.DataFrame([[2], [1], [3]], index=[20, 10, 30])}, naa
+        )
         assert isinstance(eval_df1, pandas.DataFrame)
         assert np.array_equal(eval_df1, [[2], [1], [3]])
         assert np.array_equal(eval_df1.index, [20, 10, 30])
-        eval_df2, _ = _eval_factor(fi2,
-                                   {"mock":
-                                    pandas.DataFrame([[2, 3], [1, 4], [3, -1]],
-                                                     index=[20, 30, 10])},
-                                   naa)
+        eval_df2, _ = _eval_factor(
+            fi2,
+            {"mock": pandas.DataFrame([[2, 3], [1, 4], [3, -1]], index=[20, 30, 10])},
+            naa,
+        )
         assert isinstance(eval_df2, pandas.DataFrame)
         assert np.array_equal(eval_df2, [[2, 3], [1, 4], [3, -1]])
         assert np.array_equal(eval_df2.index, [20, 30, 10])
 
-        pytest.raises(PatsyError,
-                      _eval_factor, fi2,
-                      {"mock": pandas.Series([1, 2, 3], index=[10, 20, 30])},
-                      naa)
-        pytest.raises(PatsyError,
-                      _eval_factor, fi1,
-                      {"mock":
-                       pandas.DataFrame([[2, 3], [1, 4], [3, -1]],
-                                        index=[20, 30, 10])},
-                      naa)
+        pytest.raises(
+            PatsyError,
+            _eval_factor,
+            fi2,
+            {"mock": pandas.Series([1, 2, 3], index=[10, 20, 30])},
+            naa,
+        )
+        pytest.raises(
+            PatsyError,
+            _eval_factor,
+            fi1,
+            {"mock": pandas.DataFrame([[2, 3], [1, 4], [3, -1]], index=[20, 30, 10])},
+            naa,
+        )
+
 
 def test__eval_factor_categorical():
     import pytest
     from patsy.categorical import C
+
     naa = NAAction()
     f = _MockFactor()
-    fi1 = FactorInfo(f, "categorical",
-                     {}, num_columns=None, categories=("a", "b"))
+    fi1 = FactorInfo(f, "categorical", {}, num_columns=None, categories=("a", "b"))
     assert fi1.factor is f
     cat1, _ = _eval_factor(fi1, {"mock": ["b", "a", "b"]}, naa)
     assert cat1.shape == (3,)
     assert np.all(cat1 == [1, 0, 1])
     pytest.raises(PatsyError, _eval_factor, fi1, {"mock": ["c"]}, naa)
     pytest.raises(PatsyError, _eval_factor, fi1, {"mock": C(["a", "c"])}, naa)
-    pytest.raises(PatsyError, _eval_factor, fi1,
-                  {"mock": C(["a", "b"], levels=["b", "a"])}, naa)
+    pytest.raises(
+        PatsyError, _eval_factor, fi1, {"mock": C(["a", "b"], levels=["b", "a"])}, naa
+    )
     pytest.raises(PatsyError, _eval_factor, fi1, {"mock": [1, 0, 1]}, naa)
     bad_cat = np.asarray(["b", "a", "a", "b"])
     bad_cat.resize((2, 2))
     pytest.raises(PatsyError, _eval_factor, fi1, {"mock": bad_cat}, naa)
 
-    cat1_NA, is_NA = _eval_factor(fi1, {"mock": ["a", None, "b"]},
-                                  NAAction(NA_types=["None"]))
+    cat1_NA, is_NA = _eval_factor(
+        fi1, {"mock": ["a", None, "b"]}, NAAction(NA_types=["None"])
+    )
     assert np.array_equal(is_NA, [False, True, False])
     assert np.array_equal(cat1_NA, [0, -1, 1])
-    pytest.raises(PatsyError, _eval_factor, fi1,
-                  {"mock": ["a", None, "b"]}, NAAction(NA_types=[]))
+    pytest.raises(
+        PatsyError, _eval_factor, fi1, {"mock": ["a", None, "b"]}, NAAction(NA_types=[])
+    )
 
-    fi2 = FactorInfo(_MockFactor(), "categorical", {},
-                     num_columns=None, categories=[False, True])
+    fi2 = FactorInfo(
+        _MockFactor(), "categorical", {}, num_columns=None, categories=[False, True]
+    )
     cat2, _ = _eval_factor(fi2, {"mock": [True, False, False, True]}, naa)
     assert cat2.shape == (4,)
     assert np.all(cat2 == [1, 0, 0, 1])
@@ -203,22 +215,27 @@ def test__eval_factor_categorical():
         assert np.array_equal(cat_sbool, [1, 0])
         assert np.array_equal(cat_sbool.index, [11, 21])
 
+
 def _column_combinations(columns_per_factor):
     # For consistency with R, the left-most item iterates fastest:
     iterators = [range(n) for n in reversed(columns_per_factor)]
     for reversed_combo in itertools.product(*iterators):
         yield reversed_combo[::-1]
 
+
 def test__column_combinations():
-    assert list(_column_combinations([2, 3])) == [(0, 0),
-                                                  (1, 0),
-                                                  (0, 1),
-                                                  (1, 1),
-                                                  (0, 2),
-                                                  (1, 2)]
+    assert list(_column_combinations([2, 3])) == [
+        (0, 0),
+        (1, 0),
+        (0, 1),
+        (1, 1),
+        (0, 2),
+        (1, 2),
+    ]
     assert list(_column_combinations([3])) == [(0,), (1,), (2,)]
     assert list(_column_combinations([])) == [()]
 
+
 def _subterm_column_combinations(factor_infos, subterm):
     columns_per_factor = []
     for factor in subterm.factors:
@@ -229,17 +246,18 @@ def _subterm_column_combinations(factor_infos, subterm):
         columns_per_factor.append(columns)
     return _column_combinations(columns_per_factor)
 
+
 def _subterm_column_names_iter(factor_infos, subterm):
     total = 0
     for i, column_idxs in enumerate(
-            _subterm_column_combinations(factor_infos, subterm)):
+        _subterm_column_combinations(factor_infos, subterm)
+    ):
         name_pieces = []
         for factor, column_idx in zip(subterm.factors, column_idxs):
             fi = factor_infos[factor]
             if fi.type == "numerical":
                 if fi.num_columns > 1:
-                    name_pieces.append("%s[%s]"
-                                       % (factor.name(), column_idx))
+                    name_pieces.append("%s[%s]" % (factor.name(), column_idx))
                 else:
                     assert column_idx == 0
                     name_pieces.append(factor.name())
@@ -255,94 +273,116 @@ def _subterm_column_names_iter(factor_infos, subterm):
         total += 1
     assert total == subterm.num_columns
 
+
 def _build_subterm(subterm, factor_infos, factor_values, out):
     assert subterm.num_columns == out.shape[1]
     out[...] = 1
     for i, column_idxs in enumerate(
-            _subterm_column_combinations(factor_infos, subterm)):
+        _subterm_column_combinations(factor_infos, subterm)
+    ):
         for factor, column_idx in zip(subterm.factors, column_idxs):
             if factor_infos[factor].type == "categorical":
                 contrast = subterm.contrast_matrices[factor]
                 if np.any(factor_values[factor] < 0):
-                    raise PatsyError("can't build a design matrix "
-                                     "containing missing values", factor)
+                    raise PatsyError(
+                        "can't build a design matrix containing missing values",
+                        factor,
+                    )
                 out[:, i] *= contrast.matrix[factor_values[factor], column_idx]
             else:
                 assert factor_infos[factor].type == "numerical"
-                assert (factor_values[factor].shape[1]
-                        == factor_infos[factor].num_columns)
+                assert (
+                    factor_values[factor].shape[1] == factor_infos[factor].num_columns
+                )
                 out[:, i] *= factor_values[factor][:, column_idx]
 
+
 def test__subterm_column_names_iter_and__build_subterm():
     import pytest
     from patsy.contrasts import ContrastMatrix
     from patsy.categorical import C
+
     f1 = _MockFactor("f1")
     f2 = _MockFactor("f2")
     f3 = _MockFactor("f3")
-    contrast = ContrastMatrix(np.array([[0, 0.5],
-                                        [3, 0]]),
-                              ["[c1]", "[c2]"])
-
-    factor_infos1 = {f1: FactorInfo(f1, "numerical", {},
-                                    num_columns=1, categories=None),
-                     f2: FactorInfo(f2, "categorical", {},
-                                    num_columns=None, categories=["a", "b"]),
-                     f3: FactorInfo(f3, "numerical", {},
-                                    num_columns=1, categories=None),
-                     }
+    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]), ["[c1]", "[c2]"])
+
+    factor_infos1 = {
+        f1: FactorInfo(f1, "numerical", {}, num_columns=1, categories=None),
+        f2: FactorInfo(f2, "categorical", {}, num_columns=None, categories=["a", "b"]),
+        f3: FactorInfo(f3, "numerical", {}, num_columns=1, categories=None),
+    }
     contrast_matrices = {f2: contrast}
     subterm1 = SubtermInfo([f1, f2, f3], contrast_matrices, 2)
-    assert (list(_subterm_column_names_iter(factor_infos1, subterm1))
-            == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"])
+    assert list(_subterm_column_names_iter(factor_infos1, subterm1)) == [
+        "f1:f2[c1]:f3",
+        "f1:f2[c2]:f3",
+    ]
 
     mat = np.empty((3, 2))
-    _build_subterm(subterm1, factor_infos1,
-                   {f1: atleast_2d_column_default([1, 2, 3]),
-                    f2: np.asarray([0, 0, 1]),
-                    f3: atleast_2d_column_default([7.5, 2, -12])},
-                   mat)
-    assert np.allclose(mat, [[0, 0.5 * 1 * 7.5],
-                             [0, 0.5 * 2 * 2],
-                             [3 * 3 * -12, 0]])
+    _build_subterm(
+        subterm1,
+        factor_infos1,
+        {
+            f1: atleast_2d_column_default([1, 2, 3]),
+            f2: np.asarray([0, 0, 1]),
+            f3: atleast_2d_column_default([7.5, 2, -12]),
+        },
+        mat,
+    )
+    assert np.allclose(mat, [[0, 0.5 * 1 * 7.5], [0, 0.5 * 2 * 2], [3 * 3 * -12, 0]])
     # Check that missing categorical values blow up
-    pytest.raises(PatsyError, _build_subterm, subterm1, factor_infos1,
-                  {f1: atleast_2d_column_default([1, 2, 3]),
-                   f2: np.asarray([0, -1, 1]),
-                   f3: atleast_2d_column_default([7.5, 2, -12])},
-                  mat)
+    pytest.raises(
+        PatsyError,
+        _build_subterm,
+        subterm1,
+        factor_infos1,
+        {
+            f1: atleast_2d_column_default([1, 2, 3]),
+            f2: np.asarray([0, -1, 1]),
+            f3: atleast_2d_column_default([7.5, 2, -12]),
+        },
+        mat,
+    )
 
     factor_infos2 = dict(factor_infos1)
-    factor_infos2[f1] = FactorInfo(f1, "numerical", {},
-                                   num_columns=2, categories=None)
+    factor_infos2[f1] = FactorInfo(f1, "numerical", {}, num_columns=2, categories=None)
     subterm2 = SubtermInfo([f1, f2, f3], contrast_matrices, 4)
-    assert (list(_subterm_column_names_iter(factor_infos2, subterm2))
-            == ["f1[0]:f2[c1]:f3",
-                "f1[1]:f2[c1]:f3",
-                "f1[0]:f2[c2]:f3",
-                "f1[1]:f2[c2]:f3"])
+    assert list(_subterm_column_names_iter(factor_infos2, subterm2)) == [
+        "f1[0]:f2[c1]:f3",
+        "f1[1]:f2[c1]:f3",
+        "f1[0]:f2[c2]:f3",
+        "f1[1]:f2[c2]:f3",
+    ]
 
     mat2 = np.empty((3, 4))
-    _build_subterm(subterm2, factor_infos2,
-                   {f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]),
-                    f2: np.asarray([0, 0, 1]),
-                    f3: atleast_2d_column_default([7.5, 2, -12])},
-                   mat2)
-    assert np.allclose(mat2, [[0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5],
-                              [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2],
-                              [3 * 5 * -12, 3 * 6 * -12, 0, 0]])
-
+    _build_subterm(
+        subterm2,
+        factor_infos2,
+        {
+            f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]),
+            f2: np.asarray([0, 0, 1]),
+            f3: atleast_2d_column_default([7.5, 2, -12]),
+        },
+        mat2,
+    )
+    assert np.allclose(
+        mat2,
+        [
+            [0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5],
+            [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2],
+            [3 * 5 * -12, 3 * 6 * -12, 0, 0],
+        ],
+    )
 
     subterm_int = SubtermInfo([], {}, 1)
     assert list(_subterm_column_names_iter({}, subterm_int)) == ["Intercept"]
 
     mat3 = np.empty((3, 1))
-    _build_subterm(subterm_int, {},
-                   {f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]},
-                   mat3)
+    _build_subterm(subterm_int, {}, {f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}, mat3)
     assert np.allclose(mat3, 1)
 
+
 def _factors_memorize(factors, data_iter_maker, eval_env):
     # First, start off the memorization process by setting up each factor's
     # state and finding out how many passes it will need:
@@ -372,6 +412,7 @@ def _factors_memorize(factors, data_iter_maker, eval_env):
         which_pass += 1
     return factor_states
 
+
 def test__factors_memorize():
     class MockFactor(object):
         def __init__(self, requested_passes, token):
@@ -396,12 +437,15 @@ def memorize_finish(self, state, which_pass):
 
     class Data(object):
         CHUNKS = 3
+
         def __init__(self):
             self.calls = 0
             self.data = [{"chunk": i} for i in range(self.CHUNKS)]
+
         def __call__(self):
             self.calls += 1
             return iter(self.data)
+
     data = Data()
     f0 = MockFactor(0, "f0")
     f1 = MockFactor(1, "f1")
@@ -415,24 +459,29 @@ def __call__(self):
         f0: {
             "calls": [],
             "token": "f0",
-            },
+        },
         f1: {
             "calls": mem_chunks0 + [("memorize_finish", 0)],
             "token": "f1",
-            },
+        },
         f2a: {
-            "calls": mem_chunks0 + [("memorize_finish", 0)]
-                     + mem_chunks1 + [("memorize_finish", 1)],
+            "calls": mem_chunks0
+            + [("memorize_finish", 0)]
+            + mem_chunks1
+            + [("memorize_finish", 1)],
             "token": "f2a",
-            },
+        },
         f2b: {
-            "calls": mem_chunks0 + [("memorize_finish", 0)]
-                     + mem_chunks1 + [("memorize_finish", 1)],
+            "calls": mem_chunks0
+            + [("memorize_finish", 0)]
+            + mem_chunks1
+            + [("memorize_finish", 1)],
             "token": "f2b",
-            },
-        }
+        },
+    }
     assert factor_states == expected
 
+
 def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action):
     num_column_counts = {}
     cat_sniffers = {}
@@ -442,8 +491,7 @@ def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action):
         value = factor.eval(factor_states[factor], data)
         if factor in cat_sniffers or guess_categorical(value):
             if factor not in cat_sniffers:
-                cat_sniffers[factor] = CategoricalSniffer(NA_action,
-                                                          factor.origin)
+                cat_sniffers[factor] = CategoricalSniffer(NA_action, factor.origin)
             done = cat_sniffers[factor].sniff(value)
             if done:
                 examine_needed.remove(factor)
@@ -462,12 +510,15 @@ def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action):
             cat_levels_contrasts[factor] = sniffer.levels_contrast()
     return (num_column_counts, cat_levels_contrasts)
 
+
 def test__examine_factor_types():
     from patsy.categorical import C
+
     class MockFactor(object):
         def __init__(self):
             # You should check this using 'is', not '=='
             from patsy.origin import Origin
+
             self.origin = Origin("MOCK", 1, 2)
 
         def eval(self, state, data):
@@ -493,6 +544,7 @@ def next(self):
             if self.i > 1:
                 raise StopIteration
             return self.i
+
         __next__ = next
 
     num_1dim = MockFactor()
@@ -507,20 +559,21 @@ def next(self):
         num_1dim: ([1, 2, 3], [4, 5, 6]),
         num_1col: ([[1], [2], [3]], [[4], [5], [6]]),
         num_4col: (np.zeros((3, 4)), np.ones((3, 4))),
-        categ_1col: (C(["a", "b", "c"], levels=("a", "b", "c"),
-                       contrast="MOCK CONTRAST"),
-                     C(["c", "b", "a"], levels=("a", "b", "c"),
-                       contrast="MOCK CONTRAST")),
+        categ_1col: (
+            C(["a", "b", "c"], levels=("a", "b", "c"), contrast="MOCK CONTRAST"),
+            C(["c", "b", "a"], levels=("a", "b", "c"), contrast="MOCK CONTRAST"),
+        ),
         bool_1col: ([True, True, False], [False, True, True]),
         # It has to read through all the data to see all the possible levels:
         string_1col: (["a", "a", "a"], ["c", "b", "a"]),
         object_1col: ([object_levels[0]] * 3, object_levels),
-        }
+    }
     it = DataIterMaker()
-    (num_column_counts, cat_levels_contrasts,
-     ) = _examine_factor_types(factor_states.keys(), factor_states, it,
-                               NAAction())
+    (
+        num_column_counts,
+        cat_levels_contrasts,
+    ) = _examine_factor_types(factor_states.keys(), factor_states, it, NAAction())
     assert it.i == 2
     iterations = 0
     assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4}
@@ -529,20 +582,21 @@ def next(self):
         bool_1col: ((False, True), None),
         string_1col: (("a", "b", "c"), None),
         object_1col: (tuple(sorted(object_levels, key=id)), None),
-        }
+    }
 
     # Check that it doesn't read through all the data if that's not necessary:
     it = DataIterMaker()
     no_read_necessary = [num_1dim, num_1col, num_4col, categ_1col, bool_1col]
-    (num_column_counts, cat_levels_contrasts,
-     ) = _examine_factor_types(no_read_necessary, factor_states, it,
-                               NAAction())
+    (
+        num_column_counts,
+        cat_levels_contrasts,
+    ) = _examine_factor_types(no_read_necessary, factor_states, it, NAAction())
     assert it.i == 0
     assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4}
     assert cat_levels_contrasts == {
         categ_1col: (("a", "b", "c"), "MOCK CONTRAST"),
         bool_1col: ((False, True), None),
-        }
+    }
 
     # Illegal inputs:
     bool_3col = MockFactor()
@@ -555,21 +609,22 @@ def next(self):
         num_3dim: (np.zeros((3, 3, 3)), np.ones((3, 3, 3))),
         string_3col: ([["a", "b", "c"]], [["b", "c", "a"]]),
         object_3col: ([[[object()]]], [[[object()]]]),
-        }
+    }
     import pytest
+
     for illegal_factor in illegal_factor_states:
         it = DataIterMaker()
         try:
-            _examine_factor_types([illegal_factor], illegal_factor_states, it,
-                                  NAAction())
+            _examine_factor_types(
+                [illegal_factor], illegal_factor_states, it, NAAction()
+            )
         except PatsyError as e:
             assert e.origin is illegal_factor.origin
         else:
             assert False
 
-def _make_subterm_infos(terms,
-                        num_column_counts,
-                        cat_levels_contrasts):
+
+def _make_subterm_infos(terms, num_column_counts, cat_levels_contrasts):
     # Sort each term into a bucket based on the set of numeric factors it
     # contains:
     term_buckets = OrderedDict()
@@ -600,9 +655,9 @@ def _make_subterm_infos(terms, num_column_counts, cat_levels_contrasts):
         used_subterms = set()
         for term in bucket_terms:
             subterm_infos = []
-            factor_codings = pick_contrasts_for_term(term,
-                                                     num_column_counts,
-                                                     used_subterms)
+            factor_codings = pick_contrasts_for_term(
+                term, num_column_counts, used_subterms
+            )
             # Construct one SubtermInfo for each subterm
             for factor_coding in factor_codings:
                 subterm_factors = []
@@ -622,20 +677,20 @@ def _make_subterm_infos(terms,
                         levels, contrast = cat_levels_contrasts[factor]
                         # This is where the default coding is set to
                         # Treatment:
-                        coded = code_contrast_matrix(factor_coding[factor],
-                                                     levels, contrast,
-                                                     default=Treatment)
+                        coded = code_contrast_matrix(
+                            factor_coding[factor], levels, contrast, default=Treatment
+                        )
                         contrast_matrices[factor] = coded
                         subterm_columns *= coded.matrix.shape[1]
-                subterm_infos.append(SubtermInfo(subterm_factors,
-                                                 contrast_matrices,
-                                                 subterm_columns))
+                subterm_infos.append(
+                    SubtermInfo(subterm_factors, contrast_matrices, subterm_columns)
+                )
             term_to_subterm_infos[term] = subterm_infos
     assert new_term_order == list(term_to_subterm_infos)
     return term_to_subterm_infos
 
-def design_matrix_builders(termlists, data_iter_maker, eval_env,
-                           NA_action="drop"):
+
+def design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action="drop"):
     """Construct several :class:`DesignInfo` objects from termlists.
 
     This is one of Patsy's fundamental functions. This function and
@@ -688,36 +743,38 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env,
     factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env)
     # Now all the factors have working eval methods, so we can evaluate them
     # on some data to find out what type of data they return.
-    (num_column_counts,
-     cat_levels_contrasts) = _examine_factor_types(all_factors,
-                                                   factor_states,
-                                                   data_iter_maker,
-                                                   NA_action)
+    (num_column_counts, cat_levels_contrasts) = _examine_factor_types(
+        all_factors, factor_states, data_iter_maker, NA_action
+    )
     # Now we need the factor infos, which encapsulate the knowledge of
     # how to turn any given factor into a chunk of data:
     factor_infos = {}
     for factor in all_factors:
         if factor in num_column_counts:
-            fi = FactorInfo(factor,
-                            "numerical",
-                            factor_states[factor],
-                            num_columns=num_column_counts[factor],
-                            categories=None)
+            fi = FactorInfo(
+                factor,
+                "numerical",
+                factor_states[factor],
+                num_columns=num_column_counts[factor],
+                categories=None,
+            )
         else:
             assert factor in cat_levels_contrasts
             categories = cat_levels_contrasts[factor][0]
-            fi = FactorInfo(factor,
-                            "categorical",
-                            factor_states[factor],
-                            num_columns=None,
-                            categories=categories)
+            fi = FactorInfo(
+                factor,
+                "categorical",
+                factor_states[factor],
+                num_columns=None,
+                categories=categories,
+            )
         factor_infos[factor] = fi
     # And now we can construct the DesignInfo for each termlist:
     design_infos = []
     for termlist in termlists:
-        term_to_subterm_infos = _make_subterm_infos(termlist,
-                                                    num_column_counts,
-                                                    cat_levels_contrasts)
+        term_to_subterm_infos = _make_subterm_infos(
+            termlist, num_column_counts, cat_levels_contrasts
+        )
         assert isinstance(term_to_subterm_infos, OrderedDict)
         assert frozenset(term_to_subterm_infos) == frozenset(termlist)
         this_design_factor_infos = {}
@@ -727,14 +784,18 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env,
         column_names = []
         for subterms in term_to_subterm_infos.values():
             for subterm in subterms:
-                for column_name in _subterm_column_names_iter(
-                        factor_infos, subterm):
+                for column_name in _subterm_column_names_iter(factor_infos, subterm):
                     column_names.append(column_name)
-        design_infos.append(DesignInfo(column_names,
-                                       factor_infos=this_design_factor_infos,
-                                       term_codings=term_to_subterm_infos))
+        design_infos.append(
+            DesignInfo(
+                column_names,
+                factor_infos=this_design_factor_infos,
+                term_codings=term_to_subterm_infos,
+            )
+        )
     return design_infos
 
+
 def _build_design_matrix(design_info, factor_info_to_values, dtype):
     factor_to_values = {}
     need_reshape = False
@@ -764,12 +825,12 @@ def _build_design_matrix(design_info, factor_info_to_values, dtype):
     for subterms in design_info.term_codings.values():
         for subterm in subterms:
             end_column = start_column + subterm.num_columns
             m_slice = m[:, start_column:end_column]
-            _build_subterm(subterm, design_info.factor_infos,
-                           factor_to_values, m_slice)
+            _build_subterm(subterm, design_info.factor_infos, factor_to_values, m_slice)
             start_column = end_column
     assert start_column == m.shape[1]
     return need_reshape, m
 
+
 class _CheckMatch(object):
     def __init__(self, name, eq_fn):
         self._name = name
@@ -785,18 +846,21 @@ def check(self, seen_value, desc, origin):
             self._value_origin = origin
         else:
             if not self._eq_fn(self.value, seen_value):
-                msg = ("%s mismatch between %s and %s"
-                       % (self._name, self._value_desc, desc))
+                msg = "%s mismatch between %s and %s" % (
+                    self._name,
+                    self._value_desc,
+                    desc,
+                )
                 if isinstance(self.value, int):
                     msg += " (%r versus %r)" % (self.value, seen_value)
                 # XX FIXME: this is a case where having discontiguous Origins
                 # would be useful...
                 raise PatsyError(msg, origin)
 
-def build_design_matrices(design_infos, data,
-                          NA_action="drop",
-                          return_type="matrix",
-                          dtype=np.dtype(float)):
+
+def build_design_matrices(
+    design_infos, data, NA_action="drop", return_type="matrix", dtype=np.dtype(float)
+):
     """Construct several design matrices from :class:`DesignMatrixBuilder`
     objects.
 
@@ -865,11 +929,14 @@ def build_design_matrices(design_infos, data,
     if isinstance(NA_action, str):
         NA_action = NAAction(NA_action)
     if return_type == "dataframe" and not have_pandas:
-        raise PatsyError("pandas.DataFrame was requested, but pandas "
-                         "is not installed")
+        raise PatsyError(
+            "pandas.DataFrame was requested, but pandas is not installed"
+        )
     if return_type not in ("matrix", "dataframe"):
-        raise PatsyError("unrecognized output type %r, should be "
-                         "'matrix' or 'dataframe'" % (return_type,))
+        raise PatsyError(
+            "unrecognized output type %r, should be "
+            "'matrix' or 'dataframe'" % (return_type,)
+        )
     # Evaluate factors
     factor_info_to_values = {}
     factor_info_to_isNAs = {}
@@ -890,8 +957,7 @@ def build_design_matrices(design_infos, data,
             name = factor_info.factor.name()
             origin = factor_info.factor.origin
             rows_checker.check(value.shape[0], name, origin)
-            if (have_pandas
-                and isinstance(value, (pandas.Series, pandas.DataFrame))):
+            if have_pandas and isinstance(value, (pandas.Series, pandas.DataFrame)):
                 index_checker.check(value.index, name, origin)
             # Strategy: we work with raw ndarrays for doing the actual
             # combining; DesignMatrixBuilder objects never sees pandas
@@ -904,8 +970,7 @@
     # Handle NAs
     values = list(factor_info_to_values.values())
     is_NAs = list(factor_info_to_isNAs.values())
-    origins = [factor_info.factor.origin
-               for factor_info in factor_info_to_values]
+    origins = [factor_info.factor.origin for factor_info in factor_info_to_values]
     pandas_index = index_checker.value
     num_rows = rows_checker.value
     # num_rows is None iff evaluator_to_values (and associated sets like
@@ -927,9 +992,7 @@
     # Build factor values into matrices
     results = []
     for design_info in design_infos:
-        results.append(_build_design_matrix(design_info,
-                                            factor_info_to_values,
-                                            dtype))
+        results.append(_build_design_matrix(design_info, factor_info_to_values, dtype))
     matrices = []
     for need_reshape, matrix in results:
         if need_reshape:
@@ -939,25 +1002,27 @@
             # we can figure out what that is...
             assert matrix.shape[0] == 1
             if num_rows is not None:
-                matrix = DesignMatrix(np.repeat(matrix, num_rows, axis=0),
-                                      matrix.design_info)
+                matrix = DesignMatrix(
+                    np.repeat(matrix, num_rows, axis=0), matrix.design_info
+                )
             else:
                 raise PatsyError(
                     "No design matrix has any non-trivial factors, "
                     "the data object is not a DataFrame. "
                     "I can't tell how many rows the design matrix should "
                     "have!"
-                    )
+                )
         matrices.append(matrix)
     if return_type == "dataframe":
         assert have_pandas
         for i, matrix in enumerate(matrices):
             di = matrix.design_info
-            matrices[i] = pandas.DataFrame(matrix,
-                                           columns=di.column_names,
-                                           index=pandas_index)
+            matrices[i] = pandas.DataFrame(
+                matrix, columns=di.column_names, index=pandas_index
+            )
             matrices[i].design_info = di
     return matrices
 
+
 # It should be possible to do just the factors -> factor_infos stuff
 # alone, since that, well, makes logical sense to do.
diff --git a/patsy/builtins.py b/patsy/builtins.py
index 9a1e3b5..fb4b319 100644
--- a/patsy/builtins.py
+++ b/patsy/builtins.py
@@ -11,20 +11,26 @@
 __all__ = ["I", "Q"]
 
 from patsy.contrasts import ContrastMatrix, Treatment, Poly, Sum, Helmert, Diff
+
 __all__ += ["ContrastMatrix", "Treatment", "Poly", "Sum", "Helmert", "Diff"]
 
 from patsy.categorical import C
+
 __all__ += ["C"]
 
 from patsy.state import center, standardize, scale
+
 __all__ += ["center", "standardize", "scale"]
 
 from patsy.splines import bs
+
 __all__ += ["bs"]
 
 from patsy.mgcv_cubic_splines import cr, cc, te
+
 __all__ += ["cr", "cc", "te"]
 
+
 def I(x):
     """The identity function. Simply returns its input unchanged.
 
@@ -42,10 +48,12 @@ def I(x):
     ``x2``."""
     return x
 
+
 def test_I():
     assert I(1) == 1
     assert I(None) is None
 
+
 def Q(name):
     """A way to 'quote' variable names, especially ones that do not otherwise
     meet Python's variable name rules.
@@ -82,16 +90,18 @@ def Q(name):
       y ~ np.sqrt(Q("weight.in.kg"))
     """
     from patsy.eval import EvalEnvironment
+
     env = EvalEnvironment.capture(1)
     try:
         return env.namespace[name]
     except KeyError:
         raise NameError("no data named %r found" % (name,))
 
+
 def test_Q():
     a = 1
     assert Q("a") == 1
     assert Q("Q") is Q
     import pytest
 
-    pytest.raises(NameError, Q, "asdfsadfdsad") 
+    pytest.raises(NameError, Q, "asdfsadfdsad")
diff --git a/patsy/categorical.py b/patsy/categorical.py
index b552f42..7d5be9c 100644
--- a/patsy/categorical.py
+++ b/patsy/categorical.py
@@ -2,8 +2,7 @@
 # Copyright (C) 2011-2013 Nathaniel Smith <njs@pobox.com>
 # See file LICENSE.txt for license information.
 
-__all__ = ["C", "guess_categorical", "CategoricalSniffer",
-           "categorical_to_int"]
+__all__ = ["C", "guess_categorical", "CategoricalSniffer", "categorical_to_int"]
 
 # How we handle categorical data: the big picture
 # -----------------------------------------------
@@ -36,21 +35,26 @@
 import numpy as np
 
 from patsy import PatsyError
-from patsy.util import (SortAnythingKey,
-                        safe_scalar_isnan,
-                        iterable,
-                        have_pandas, have_pandas_categorical,
-                        have_pandas_categorical_dtype,
-                        safe_is_pandas_categorical,
-                        pandas_Categorical_from_codes,
-                        pandas_Categorical_categories,
-                        pandas_Categorical_codes,
-                        safe_issubdtype,
-                        no_pickling, assert_no_pickling)
+from patsy.util import (
+    SortAnythingKey,
+    safe_scalar_isnan,
+    iterable,
+    have_pandas,
+    have_pandas_categorical,
+    have_pandas_categorical_dtype,
+    safe_is_pandas_categorical,
+    pandas_Categorical_from_codes,
+    pandas_Categorical_categories,
+    pandas_Categorical_codes,
+    safe_issubdtype,
+    no_pickling,
+    assert_no_pickling,
+)
 
 if have_pandas:
     import pandas
 
+
 # Objects of this type will always be treated as categorical, with the
 # specified levels and contrast (if given).
 class _CategoricalBox(object):
@@ -61,6 +65,7 @@ def __init__(self, data, contrast, levels):
 
     __getstate__ = no_pickling
 
+
 def C(data, contrast=None, levels=None):
     """
     Marks some `data` as being categorical, and specifies how to interpret
@@ -101,6 +106,7 @@ def C(data, contrast=None, levels=None):
         data = data.data
     return _CategoricalBox(data, contrast, levels)
 
+
 def test_C():
     c1 = C("asdf")
     assert isinstance(c1, _CategoricalBox)
@@ -122,6 +128,7 @@ def test_C():
 
     assert_no_pickling(c4)
 
+
 def guess_categorical(data):
     if safe_is_pandas_categorical(data):
         return True
@@ -132,6 +139,7 @@ def guess_categorical(data):
             return False
     return True
 
+
 def test_guess_categorical():
     if have_pandas_categorical:
         c = pandas.Categorical([1, 2, 3])
@@ -148,6 +156,7 @@ def test_guess_categorical():
     assert not guess_categorical([1.0, 2.0, 3.0])
     assert not guess_categorical([1.0, 2.0, 3.0, np.nan])
 
+
 def _categorical_shape_fix(data):
     # helper function
     # data should not be a _CategoricalBox or pandas Categorical or anything
@@ -157,11 +166,11 @@ def _categorical_shape_fix(data):
         raise PatsyError("categorical data cannot be >1-dimensional")
     # coerce scalars into 1d, which is consistent with what we do for numeric
     # factors. (See statsmodels/statsmodels#1881)
-    if (not iterable(data)
-        or isinstance(data, (str, bytes))):
+    if not iterable(data) or isinstance(data, (str, bytes)):
         data = [data]
     return data
 
+
 class CategoricalSniffer(object):
     def __init__(self, NA_action, origin=None):
         self._NA_action = NA_action
@@ -210,17 +219,21 @@ def sniff(self, data):
                 try:
                     self._level_set.add(value)
                 except TypeError:
-                    raise PatsyError("Error interpreting categorical data: "
-                                     "all items must be hashable",
-                                     self._origin)
+                    raise PatsyError(
+                        "Error interpreting categorical data: "
+                        "all items must be hashable",
+                        self._origin,
+                    )
         # If everything we've seen is boolean, assume that everything else
         # would be too. Otherwise we need to keep looking.
         return self._level_set == set([True, False])
 
     __getstate__ = no_pickling
 
+
 def test_CategoricalSniffer():
     from patsy.missing import NAAction
+
     def t(NA_types, datas, exp_finish_fast, exp_levels, exp_contrast=None):
         sniffer = CategoricalSniffer(NAAction(NA_types=NA_types))
         for data in datas:
@@ -236,19 +249,24 @@ def t(NA_types, datas, exp_finish_fast, exp_levels, exp_contrast=None):
     # We make sure to test with both boxed and unboxed pandas objects,
     # because we used to have a bug where boxed pandas objects would be
     # treated as categorical, but their levels would be lost...
-    preps = [lambda x: x,
-             C]
+    preps = [lambda x: x, C]
     if have_pandas_categorical_dtype:
-        preps += [pandas.Series,
-                  lambda x: C(pandas.Series(x))]
+        preps += [pandas.Series, lambda x: C(pandas.Series(x))]
     for prep in preps:
-        t([], [prep(pandas.Categorical([1, 2, None]))],
-          True, (1, 2))
+        t([], [prep(pandas.Categorical([1, 2, None]))], True, (1, 2))
         # check order preservation
-        t([], [prep(pandas_Categorical_from_codes([1, 0], ["a", "b"]))],
-          True, ("a", "b"))
-        t([], [prep(pandas_Categorical_from_codes([1, 0], ["b", "a"]))],
-          True, ("b", "a"))
+        t(
+            [],
+            [prep(pandas_Categorical_from_codes([1, 0], ["a", "b"]))],
+            True,
+            ("a", "b"),
+        )
+        t(
+            [],
+            [prep(pandas_Categorical_from_codes([1, 0], ["b", "a"]))],
+            True,
+            ("b", "a"),
+        )
         # check that if someone sticks a .contrast field onto our object
         obj = prep(pandas.Categorical(["a", "b"]))
         obj.contrast = "CONTRAST"
@@ -260,8 +278,7 @@ def t(NA_types, datas, exp_finish_fast, exp_levels, exp_contrast=None):
     t([], [C([1, 2], levels=[3, 2, 1]), C([4, 2])], True, (3, 2, 1))
 
     # do some actual sniffing with NAs in
-    t(["None", "NaN"], [C([1, np.nan]), C([10, None])],
-      False, (1, 10))
+    t(["None", "NaN"], [C([1, np.nan]), C([10, None])], False, (1, 10))
     # But 'None' can be a type if we don't make it represent NA:
     sniffer = CategoricalSniffer(NAAction(NA_types=["NaN"]))
     sniffer.sniff(C([1, np.nan, None]))
@@ -273,17 +290,18 @@ def t(NA_types, datas, exp_finish_fast, exp_levels, exp_contrast=None):
     assert set(levels) == set([None, 1])
 
     # bool special cases
-    t(["None", "NaN"], [C([True, np.nan, None])],
-      True, (False, True))
-    t([], [C([10, 20]), C([False]), C([30, 40])],
-      False, (False, True, 10, 20, 30, 40))
+    t(["None", "NaN"], [C([True, np.nan, None])], True, (False, True))
+    t([], [C([10, 20]), C([False]), C([30, 40])], False, (False, True, 10, 20, 30, 40))
 
     # exercise the fast-path
-    t([], [np.asarray([True, False]), ["foo"]],
-      True, (False, True))
+    t([], [np.asarray([True, False]), ["foo"]], True, (False, True))
 
     # check tuples too
-    t(["None", "NaN"], [C([("b", 2), None, ("a", 1), np.nan, ("c", None)])],
-      False, (("a", 1), ("b", 2), ("c", None)))
+    t(
+        ["None", "NaN"],
+        [C([("b", 2), None, ("a", 1), np.nan, ("c", None)])],
+        False,
+        (("a", 1), ("b", 2), ("c", None)),
+    )
 
     # contrasts
     t([], [C([10, 20], contrast="FOO")], False, (10, 20), "FOO")
@@ -304,6 +322,7 @@ def t(NA_types, datas, exp_finish_fast, exp_levels, exp_contrast=None):
     # >1d is illegal
     pytest.raises(PatsyError, sniffer.sniff, np.asarray([["b"]]))
 
+
 # returns either a 1d ndarray or a pandas.Series
 def categorical_to_int(data, levels, NA_action, origin=None):
     assert isinstance(levels, tuple)
@@ -312,16 +331,21 @@ def categorical_to_int(data, levels, NA_action, origin=None):
     if safe_is_pandas_categorical(data):
         data_levels_tuple = tuple(pandas_Categorical_categories(data))
         if not data_levels_tuple == levels:
-            raise PatsyError("mismatching levels: expected %r, got %r"
-                             % (levels, data_levels_tuple), origin)
+            raise PatsyError(
+                "mismatching levels: expected %r, got %r" % (levels, data_levels_tuple),
+                origin,
+            )
         # pandas.Categorical also uses -1 to indicate NA, and we don't try to
         # second-guess its NA detection, so we can just pass it back.
         return pandas_Categorical_codes(data)
 
     if isinstance(data, _CategoricalBox):
         if data.levels is not None and tuple(data.levels) != levels:
-            raise PatsyError("mismatching levels: expected %r, got %r"
-                             % (levels, tuple(data.levels)), origin)
+            raise PatsyError(
+                "mismatching levels: expected %r, got %r"
+                % (levels, tuple(data.levels)),
+                origin,
+            )
         data = data.data
 
     data = _categorical_shape_fix(data)
@@ -329,8 +353,9 @@ def categorical_to_int(data, levels, NA_action, origin=None):
     try:
         level_to_int = dict(zip(levels, range(len(levels))))
     except TypeError:
-        raise PatsyError("Error interpreting categorical data: "
-                         "all items must be hashable", origin)
+        raise PatsyError(
+            "Error interpreting categorical data: all items must be hashable", origin
+        )
 
     # fastpath to avoid doing an item-by-item iteration over boolean arrays,
     # as requested by #44
@@ -350,42 +375,52 @@ def categorical_to_int(data, levels, NA_action, origin=None):
             if len(levels) <= SHOW_LEVELS:
                 level_strs += [repr(level) for level in levels]
             else:
-                level_strs += [repr(level)
-                               for level in levels[:SHOW_LEVELS//2]]
+                level_strs += [repr(level) for level in levels[: SHOW_LEVELS // 2]]
                 level_strs.append("...")
-                level_strs += [repr(level)
-                               for level in levels[-SHOW_LEVELS//2:]]
+                level_strs += [repr(level) for level in levels[-SHOW_LEVELS // 2 :]]
             level_str = "[%s]" % (", ".join(level_strs))
-            raise PatsyError("Error converting data to categorical: "
-                             "observation with value %r does not match "
-                             "any of the expected levels (expected: %s)"
-                             % (value, level_str), origin)
+            raise PatsyError(
+                "Error converting data to categorical: "
+                "observation with value %r does not match "
+                "any of the expected levels (expected: %s)" % (value, level_str),
+                origin,
+            )
         except TypeError:
-            raise PatsyError("Error converting data to categorical: "
-                             "encountered unhashable value %r"
-                             % (value,), origin)
+            raise PatsyError(
+                "Error converting data to categorical: "
+                "encountered unhashable value %r" % (value,),
+                origin,
+            )
     if have_pandas and isinstance(data, pandas.Series):
         out = pandas.Series(out, index=data.index)
     return out
 
+
 def test_categorical_to_int():
     import pytest
     from patsy.missing import NAAction
+
     if have_pandas:
         s = pandas.Series(["a", "b", "c"], index=[10, 20, 30])
         c_pandas = categorical_to_int(s, ("a", "b", "c"), NAAction())
         assert np.all(c_pandas == [0, 1, 2])
         assert np.all(c_pandas.index == [10, 20, 30])
         # Input must be 1-dimensional
-        pytest.raises(PatsyError,
-                      categorical_to_int,
-                      pandas.DataFrame({10: s}), ("a", "b", "c"), NAAction())
+        pytest.raises(
+            PatsyError,
+            categorical_to_int,
+            pandas.DataFrame({10: s}),
+            ("a", "b", "c"),
+            NAAction(),
+        )
 
     if have_pandas_categorical:
         constructors = [pandas_Categorical_from_codes]
         if have_pandas_categorical_dtype:
+
             def Series_from_codes(codes, categories):
                 c = pandas_Categorical_from_codes(codes, categories)
                 return pandas.Series(c)
+
             constructors.append(Series_from_codes)
         for con in constructors:
             cat = con([1, 0, -1], ("a", "b"))
@@ -393,20 +428,23 @@ def Series_from_codes(codes, categories):
             assert np.all(conv == [1, 0, -1])
             # Trust pandas NA marking
             cat2 = con([1, 0, -1], ("a", "None"))
-            conv2 = categorical_to_int(cat, ("a", "b"),
-                                       NAAction(NA_types=["None"]))
+            conv2 = categorical_to_int(cat, ("a", "b"), NAAction(NA_types=["None"]))
("a", "b"), NAAction(NA_types=["None"])) assert np.all(conv2 == [1, 0, -1]) # But levels must match - pytest.raises(PatsyError, - categorical_to_int, - con([1, 0], ("a", "b")), - ("a", "c"), - NAAction()) - pytest.raises(PatsyError, - categorical_to_int, - con([1, 0], ("a", "b")), - ("b", "a"), - NAAction()) + pytest.raises( + PatsyError, + categorical_to_int, + con([1, 0], ("a", "b")), + ("a", "c"), + NAAction(), + ) + pytest.raises( + PatsyError, + categorical_to_int, + con([1, 0], ("a", "b")), + ("b", "a"), + NAAction(), + ) def t(data, levels, expected, NA_action=NAAction()): got = categorical_to_int(data, levels, NA_action) @@ -422,16 +460,21 @@ def t(data, levels, expected, NA_action=NAAction()): t(["a", "b", "a"], ("a", "d", "z", "b"), [0, 3, 0]) t([("a", 1), ("b", 0), ("a", 1)], (("a", 1), ("b", 0)), [0, 1, 0]) - pytest.raises(PatsyError, categorical_to_int, - ["a", "b", "a"], ("a", "c"), NAAction()) + pytest.raises( + PatsyError, categorical_to_int, ["a", "b", "a"], ("a", "c"), NAAction() + ) t(C(["a", "b", "a"]), ("a", "b"), [0, 1, 0]) t(C(["a", "b", "a"]), ("b", "a"), [1, 0, 1]) t(C(["a", "b", "a"], levels=["b", "a"]), ("b", "a"), [1, 0, 1]) # Mismatch between C() levels and expected levels - pytest.raises(PatsyError, categorical_to_int, - C(["a", "b", "a"], levels=["a", "b"]), - ("b", "a"), NAAction()) + pytest.raises( + PatsyError, + categorical_to_int, + C(["a", "b", "a"], levels=["a", "b"]), + ("b", "a"), + NAAction(), + ) # ndim == 0 is okay t("a", ("a", "b"), [0]) @@ -439,26 +482,47 @@ def t(data, levels, expected, NA_action=NAAction()): t(True, (False, True), [1]) # ndim == 2 is disallowed - pytest.raises(PatsyError, categorical_to_int, - np.asarray([["a", "b"], ["b", "a"]]), - ("a", "b"), NAAction()) + pytest.raises( + PatsyError, + categorical_to_int, + np.asarray([["a", "b"], ["b", "a"]]), + ("a", "b"), + NAAction(), + ) # levels must be hashable - pytest.raises(PatsyError, categorical_to_int, - ["a", "b"], ("a", "b", {}), NAAction()) - pytest.raises(PatsyError, categorical_to_int, - ["a", "b", {}], ("a", "b"), NAAction()) - - t(["b", None, np.nan, "a"], ("a", "b"), [1, -1, -1, 0], - NAAction(NA_types=["None", "NaN"])) - t(["b", None, np.nan, "a"], ("a", "b", None), [1, -1, -1, 0], - NAAction(NA_types=["None", "NaN"])) - t(["b", None, np.nan, "a"], ("a", "b", None), [1, 2, -1, 0], - NAAction(NA_types=["NaN"])) + pytest.raises( + PatsyError, categorical_to_int, ["a", "b"], ("a", "b", {}), NAAction() + ) + pytest.raises( + PatsyError, categorical_to_int, ["a", "b", {}], ("a", "b"), NAAction() + ) + + t( + ["b", None, np.nan, "a"], + ("a", "b"), + [1, -1, -1, 0], + NAAction(NA_types=["None", "NaN"]), + ) + t( + ["b", None, np.nan, "a"], + ("a", "b", None), + [1, -1, -1, 0], + NAAction(NA_types=["None", "NaN"]), + ) + t( + ["b", None, np.nan, "a"], + ("a", "b", None), + [1, 2, -1, 0], + NAAction(NA_types=["NaN"]), + ) # Smoke test for the branch that formats the ellipsized list of levels in # the error message: - pytest.raises(PatsyError, categorical_to_int, - ["a", "b", "q"], - ("a", "b", "c", "d", "e", "f", "g", "h"), - NAAction()) + pytest.raises( + PatsyError, + categorical_to_int, + ["a", "b", "q"], + ("a", "b", "c", "d", "e", "f", "g", "h"), + NAAction(), + ) diff --git a/patsy/compat.py b/patsy/compat.py index 882e13e..5d56d22 100644 --- a/patsy/compat.py +++ b/patsy/compat.py @@ -9,6 +9,7 @@ ##### Numpy import os + # To force use of the compat code, set this env var to a non-empty value: optional_dep_ok = not os.environ.get("PATSY_AVOID_OPTIONAL_DEPENDENCIES") @@ 
-23,6 +24,7 @@ # OrderedDict is only available in Python 2.7+. compat_ordereddict.py has # comments at the top. import collections + if optional_dep_ok and hasattr(collections, "OrderedDict"): from collections import OrderedDict else: @@ -32,11 +34,10 @@ import sys from patsy import PatsyError + def call_and_wrap_exc(msg, origin, f, *args, **kwargs): try: return f(*args, **kwargs) except Exception as e: - new_exc = PatsyError("%s: %s: %s" - % (msg, e.__class__.__name__, e), - origin) + new_exc = PatsyError("%s: %s: %s" % (msg, e.__class__.__name__, e), origin) raise new_exc from e diff --git a/patsy/compat_ordereddict.py b/patsy/compat_ordereddict.py index 9fd11f7..644a662 100644 --- a/patsy/compat_ordereddict.py +++ b/patsy/compat_ordereddict.py @@ -1,27 +1,29 @@ # Backport of OrderedDict() class that runs on Python 2.4, 2.5, 2.6, 2.7 and pypy. # Passes Python2.7's test suite and incorporates all the latest updates. -#Author: Raymond Hettinger -#License: MIT License -#http://code.activestate.com/recipes/576693/ revision 9, downloaded 2012-03-28 +# Author: Raymond Hettinger +# License: MIT License +# http://code.activestate.com/recipes/576693/ revision 9, downloaded 2012-03-28 try: from thread import get_ident as _get_ident except ImportError: # Hacked by njs -- I don't have dummy_thread and py3 doesn't have thread, # so the import fails when nosetests3 tries to load this file. - #from dummy_thread import get_ident as _get_ident + # from dummy_thread import get_ident as _get_ident def _get_ident(): return "" + try: from _abcoll import KeysView, ValuesView, ItemsView except ImportError: pass -class OrderedDict(dict): # pragma: no cover - 'Dictionary that remembers insertion order' +class OrderedDict(dict): # pragma: no cover + "Dictionary that remembers insertion order" + # An inherited dict maps keys to values. # The inherited dict provides __getitem__, __len__, __contains__, and get. # The remaining methods are order-aware. @@ -33,23 +35,23 @@ class OrderedDict(dict): # pragma: no cover # Each link is stored as a list of length three: [PREV, NEXT, KEY]. def __init__(self, *args, **kwds): - '''Initialize an ordered dictionary. Signature is the same as for + """Initialize an ordered dictionary. Signature is the same as for regular dictionaries, but keyword arguments are not recommended because their insertion order is arbitrary. - ''' + """ if len(args) > 1: - raise TypeError('expected at most 1 arguments, got %d' % len(args)) + raise TypeError("expected at most 1 arguments, got %d" % len(args)) try: self.__root except AttributeError: - self.__root = root = [] # sentinel node + self.__root = root = [] # sentinel node root[:] = [root, root, None] self.__map = {} self.__update(*args, **kwds) def __setitem__(self, key, value, dict_setitem=dict.__setitem__): - 'od.__setitem__(i, y) <==> od[i]=y' + "od.__setitem__(i, y) <==> od[i]=y" # Setting a new item creates a new link which goes at the end of the linked # list, and the inherited dictionary is updated with the new key/value pair. if key not in self: @@ -59,7 +61,7 @@ def __setitem__(self, key, value, dict_setitem=dict.__setitem__): dict_setitem(self, key, value) def __delitem__(self, key, dict_delitem=dict.__delitem__): - 'od.__delitem__(y) <==> del od[y]' + "od.__delitem__(y) <==> del od[y]" # Deleting an existing item uses self.__map to find the link which is # then removed by updating the links in the predecessor and successor nodes. 
dict_delitem(self, key) @@ -68,7 +70,7 @@ def __delitem__(self, key, dict_delitem=dict.__delitem__): link_next[0] = link_prev def __iter__(self): - 'od.__iter__() <==> iter(od)' + "od.__iter__() <==> iter(od)" root = self.__root curr = root[1] while curr is not root: @@ -76,7 +78,7 @@ def __iter__(self): curr = curr[1] def __reversed__(self): - 'od.__reversed__() <==> reversed(od)' + "od.__reversed__() <==> reversed(od)" root = self.__root curr = root[0] while curr is not root: @@ -84,7 +86,7 @@ def __reversed__(self): curr = curr[0] def clear(self): - 'od.clear() -> None. Remove all items from od.' + "od.clear() -> None. Remove all items from od." try: for node in self.__map.itervalues(): del node[:] @@ -96,12 +98,12 @@ def clear(self): dict.clear(self) def popitem(self, last=True): - '''od.popitem() -> (k, v), return and remove a (key, value) pair. + """od.popitem() -> (k, v), return and remove a (key, value) pair. Pairs are returned in LIFO order if last is true or FIFO order if false. - ''' + """ if not self: - raise KeyError('dictionary is empty') + raise KeyError("dictionary is empty") root = self.__root if last: link = root[0] @@ -121,45 +123,47 @@ def popitem(self, last=True): # -- the following methods do not depend on the internal structure -- def keys(self): - 'od.keys() -> list of keys in od' + "od.keys() -> list of keys in od" return list(self) def values(self): - 'od.values() -> list of values in od' + "od.values() -> list of values in od" return [self[key] for key in self] def items(self): - 'od.items() -> list of (key, value) pairs in od' + "od.items() -> list of (key, value) pairs in od" return [(key, self[key]) for key in self] def iterkeys(self): - 'od.iterkeys() -> an iterator over the keys in od' + "od.iterkeys() -> an iterator over the keys in od" return iter(self) def itervalues(self): - 'od.itervalues -> an iterator over the values in od' + "od.itervalues -> an iterator over the values in od" for k in self: yield self[k] def iteritems(self): - 'od.iteritems -> an iterator over the (key, value) items in od' + "od.iteritems -> an iterator over the (key, value) items in od" for k in self: yield (k, self[k]) def update(*args, **kwds): - '''od.update(E, **F) -> None. Update od from dict/iterable E and F. + """od.update(E, **F) -> None. Update od from dict/iterable E and F. If E is a dict instance, does: for k in E: od[k] = E[k] If E has a .keys() method, does: for k in E.keys(): od[k] = E[k] Or if E is an iterable of items, does: for k, v in E: od[k] = v In either case, this is followed by: for k, v in F.items(): od[k] = v - ''' + """ if len(args) > 2: - raise TypeError('update() takes at most 2 positional ' - 'arguments (%d given)' % (len(args),)) + raise TypeError( + "update() takes at most 2 positional " + "arguments (%d given)" % (len(args),) + ) elif not args: - raise TypeError('update() takes at least 1 argument (0 given)') + raise TypeError("update() takes at least 1 argument (0 given)") self = args[0] # Make progressively weaker assumptions about "other" other = () @@ -168,7 +172,7 @@ def update(*args, **kwds): if isinstance(other, dict): for key in other: self[key] = other[key] - elif hasattr(other, 'keys'): + elif hasattr(other, "keys"): for key in other.keys(): self[key] = other[key] else: @@ -182,10 +186,10 @@ def update(*args, **kwds): __marker = object() def pop(self, key, default=__marker): - '''od.pop(k[,d]) -> v, remove specified key and return the corresponding value. + """od.pop(k[,d]) -> v, remove specified key and return the corresponding value. 
If key is not found, d is returned if given, otherwise KeyError is raised. - ''' + """ if key in self: result = self[key] del self[key] @@ -195,27 +199,27 @@ def pop(self, key, default=__marker): return default def setdefault(self, key, default=None): - 'od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od' + "od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od" if key in self: return self[key] self[key] = default return default def __repr__(self, _repr_running={}): - 'od.__repr__() <==> repr(od)' + "od.__repr__() <==> repr(od)" call_key = id(self), _get_ident() if call_key in _repr_running: - return '...' + return "..." _repr_running[call_key] = 1 try: if not self: - return '%s()' % (self.__class__.__name__,) - return '%s(%r)' % (self.__class__.__name__, self.items()) + return "%s()" % (self.__class__.__name__,) + return "%s(%r)" % (self.__class__.__name__, self.items()) finally: del _repr_running[call_key] def __reduce__(self): - 'Return state information for pickling' + "Return state information for pickling" items = [[k, self[k]] for k in self] inst_dict = vars(self).copy() for k in vars(OrderedDict()): @@ -225,27 +229,27 @@ def __reduce__(self): return self.__class__, (items,) def copy(self): - 'od.copy() -> a shallow copy of od' + "od.copy() -> a shallow copy of od" return self.__class__(self) @classmethod def fromkeys(cls, iterable, value=None): - '''OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S + """OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S and values equal to v (which defaults to None). - ''' + """ d = cls() for key in iterable: d[key] = value return d def __eq__(self, other): - '''od.__eq__(y) <==> od==y. Comparison to another OD is order-sensitive + """od.__eq__(y) <==> od==y. Comparison to another OD is order-sensitive while comparison to a regular mapping is order-insensitive. - ''' + """ if isinstance(other, OrderedDict): - return len(self)==len(other) and self.items() == other.items() + return len(self) == len(other) and self.items() == other.items() return dict.__eq__(self, other) def __ne__(self, other): diff --git a/patsy/constraint.py b/patsy/constraint.py index 012b226..6147a70 100644 --- a/patsy/constraint.py +++ b/patsy/constraint.py @@ -8,6 +8,7 @@ __all__ = ["LinearConstraint"] import re + try: from collections.abc import Mapping except ImportError: @@ -15,9 +16,13 @@ import numpy as np from patsy import PatsyError from patsy.origin import Origin -from patsy.util import (atleast_2d_column_default, - repr_pretty_delegate, repr_pretty_impl, - no_pickling, assert_no_pickling) +from patsy.util import ( + atleast_2d_column_default, + repr_pretty_delegate, + repr_pretty_impl, + no_pickling, + assert_no_pickling, +) from patsy.infix_parser import Token, Operator, infix_parse from patsy.parse_formula import _parsing_error_test @@ -44,6 +49,7 @@ class LinearConstraint(object): A list of strings giving the names of the variables being constrained. (Used only for consistency checking.) 
""" + def __init__(self, variable_names, coefs, constants=None): self.variable_names = list(variable_names) self.coefs = np.atleast_2d(np.asarray(coefs, dtype=float)) @@ -61,10 +67,12 @@ def __init__(self, variable_names, coefs, constants=None): raise ValueError("shape mismatch between coefs and constants") __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): assert not cycle - return repr_pretty_impl(p, self, - [self.variable_names, self.coefs, self.constants]) + return repr_pretty_impl( + p, self, [self.variable_names, self.coefs, self.constants] + ) __getstate__ = no_pickling @@ -87,6 +95,7 @@ def combine(cls, constraints): constants = np.vstack([c.constants for c in constraints]) return cls(variable_names, coefs, constants) + def test_LinearConstraint(): try: from numpy.testing import assert_equal @@ -104,7 +113,6 @@ def test_LinearConstraint(): assert lc.coefs.dtype == np.dtype(float) assert lc.constants.dtype == np.dtype(float) - # statsmodels wants to be able to create degenerate constraints like this, # see: # https://github.com/pydata/patsy/issues/89 @@ -113,20 +121,25 @@ def test_LinearConstraint(): assert_equal(lc.coefs, [[0]]) import pytest + pytest.raises(ValueError, LinearConstraint, ["a"], [[1, 2]]) pytest.raises(ValueError, LinearConstraint, ["a"], [[[1]]]) pytest.raises(ValueError, LinearConstraint, ["a"], [[1, 2]], [3, 4]) pytest.raises(ValueError, LinearConstraint, ["a", "b"], [[1, 2]], [3, 4]) pytest.raises(ValueError, LinearConstraint, ["a"], [[1]], [[]]) pytest.raises(ValueError, LinearConstraint, ["a", "b"], []) - pytest.raises(ValueError, LinearConstraint, ["a", "b"], - np.zeros((0, 2))) + pytest.raises(ValueError, LinearConstraint, ["a", "b"], np.zeros((0, 2))) assert_no_pickling(lc) + def test_LinearConstraint_combine(): - comb = LinearConstraint.combine([LinearConstraint(["a", "b"], [1, 0]), - LinearConstraint(["a", "b"], [0, 1], [1])]) + comb = LinearConstraint.combine( + [ + LinearConstraint(["a", "b"], [1, 0]), + LinearConstraint(["a", "b"], [0, 1], [1]), + ] + ) assert comb.variable_names == ["a", "b"] try: from numpy.testing import assert_equal @@ -136,38 +149,40 @@ def test_LinearConstraint_combine(): assert_equal(comb.constants, [[0], [1]]) import pytest + pytest.raises(ValueError, LinearConstraint.combine, []) - pytest.raises(ValueError, LinearConstraint.combine, - [LinearConstraint(["a"], [1]), LinearConstraint(["b"], [1])]) + pytest.raises( + ValueError, + LinearConstraint.combine, + [LinearConstraint(["a"], [1]), LinearConstraint(["b"], [1])], + ) _ops = [ Operator(",", 2, -100), - Operator("=", 2, 0), - Operator("+", 1, 100), Operator("-", 1, 100), Operator("+", 2, 100), Operator("-", 2, 100), - Operator("*", 2, 200), Operator("/", 2, 200), - ] +] _atomic = ["NUMBER", "VARIABLE"] + def _token_maker(type, string): def make_token(scanner, token_string): if type == "__OP__": actual_type = token_string else: actual_type = type - return Token(actual_type, - Origin(string, *scanner.match.span()), - token_string) + return Token(actual_type, Origin(string, *scanner.match.span()), token_string) + return make_token + def _tokenize_constraint(string, variable_names): lparen_re = r"\(" rparen_re = r"\)" @@ -186,29 +201,33 @@ def _tokenize_constraint(string, variable_names): (variable_re, _token_maker("VARIABLE", string)), (num_re, _token_maker("NUMBER", string)), (whitespace_re, None), - ] + ] scanner = re.Scanner(lexicon) tokens, leftover = scanner.scan(string) if leftover: offset = len(string) - len(leftover) - raise PatsyError("unrecognized 
token in constraint", - Origin(string, offset, offset + 1)) + raise PatsyError( + "unrecognized token in constraint", Origin(string, offset, offset + 1) + ) return tokens + def test__tokenize_constraint(): code = "2 * (a + b) = q" tokens = _tokenize_constraint(code, ["a", "b", "q"]) - expecteds = [("NUMBER", 0, 1, "2"), - ("*", 2, 3, "*"), - (Token.LPAREN, 4, 5, "("), - ("VARIABLE", 5, 6, "a"), - ("+", 7, 8, "+"), - ("VARIABLE", 9, 10, "b"), - (Token.RPAREN, 10, 11, ")"), - ("=", 12, 13, "="), - ("VARIABLE", 14, 15, "q")] + expecteds = [ + ("NUMBER", 0, 1, "2"), + ("*", 2, 3, "*"), + (Token.LPAREN, 4, 5, "("), + ("VARIABLE", 5, 6, "a"), + ("+", 7, 8, "+"), + ("VARIABLE", 9, 10, "b"), + (Token.RPAREN, 10, 11, ")"), + ("=", 12, 13, "="), + ("VARIABLE", 14, 15, "q"), + ] for got, expected in zip(tokens, expecteds): assert isinstance(got, Token) assert got.type == expected[0] @@ -216,6 +235,7 @@ def test__tokenize_constraint(): assert got.extra == expected[3] import pytest + pytest.raises(PatsyError, _tokenize_constraint, "1 + @b", ["b"]) # Shouldn't raise an error: _tokenize_constraint("1 + @b", ["@b"]) @@ -233,9 +253,10 @@ def test__tokenize_constraint(): assert [t.type for t in tokens] == ["NUMBER", "*", "VARIABLE", ","] assert [t.extra for t in tokens] == ["2", "*", "a[1,1]", ","] + def parse_constraint(string, variable_names): - return infix_parse(_tokenize_constraint(string, variable_names), - _ops, _atomic) + return infix_parse(_tokenize_constraint(string, variable_names), _ops, _atomic) + class _EvalConstraint(object): def __init__(self, variable_names): @@ -253,7 +274,7 @@ def __init__(self, variable_names): ("/", 2): self._eval_binary_div, ("=", 2): self._eval_binary_eq, (",", 2): self._eval_binary_comma, - } + } # General scheme: there are 2 types we deal with: # - linear combinations ("lincomb"s) of variables and constants, @@ -263,7 +284,7 @@ def __init__(self, variable_names): # - LinearConstraint objects def is_constant(self, coefs): - return np.all(coefs[:self._N] == 0) + return np.all(coefs[: self._N] == 0) def _eval_variable(self, tree): var = tree.token.extra @@ -292,8 +313,9 @@ def _eval_binary_div(self, tree): left = self.eval(tree.args[0]) right = self.eval(tree.args[1]) if not self.is_constant(right): - raise PatsyError("Can't divide by a variable in a linear " - "constraint", tree.args[1]) + raise PatsyError( + "Can't divide by a variable in a linear " "constraint", tree.args[1] + ) return left / right[-1] def _eval_binary_multiply(self, tree): @@ -304,8 +326,9 @@ def _eval_binary_multiply(self, tree): elif self.is_constant(right): return left * right[-1] else: - raise PatsyError("Can't multiply one variable by another " - "in a linear constraint", tree) + raise PatsyError( + "Can't multiply one variable by another " "in a linear constraint", tree + ) def _eval_binary_eq(self, tree): # Handle "a1 = a2 = a3", which is parsed as "(a1 = a2) = a3" @@ -319,7 +342,7 @@ def _eval_binary_eq(self, tree): args[i] = arg.args[1 - i] left = self.eval(args[0]) right = self.eval(args[1]) - coefs = left[:self._N] - right[:self._N] + coefs = left[: self._N] - right[: self._N] if np.all(coefs == 0): raise PatsyError("no variables appear in constraint", tree) constant = -left[-1] + right[-1] @@ -342,35 +365,33 @@ def eval(self, tree, constraint=False): return val else: assert val.size == self._N + 1 - if np.all(val[:self._N] == 0): - raise PatsyError("term is constant, with no variables", - tree) - return LinearConstraint(self._variable_names, - val[:self._N], - -val[-1]) + if 
np.all(val[: self._N] == 0): + raise PatsyError("term is constant, with no variables", tree) + return LinearConstraint(self._variable_names, val[: self._N], -val[-1]) else: # Force it to *not* be a constraint if isinstance(val, LinearConstraint): raise PatsyError("unexpected constraint object", tree) return val + def linear_constraint(constraint_like, variable_names): """This is the internal interface implementing DesignInfo.linear_constraint, see there for docs.""" if isinstance(constraint_like, LinearConstraint): if constraint_like.variable_names != variable_names: - raise ValueError("LinearConstraint has wrong variable_names " - "(got %r, expected %r)" - % (constraint_like.variable_names, - variable_names)) + raise ValueError( + "LinearConstraint has wrong variable_names " + "(got %r, expected %r)" + % (constraint_like.variable_names, variable_names) + ) return constraint_like if isinstance(constraint_like, Mapping): # Simple conjunction-of-equality constraints can be specified as # dicts. {"x": 1, "y": 2} -> tests x = 1 and y = 2. Keys can be # either variable names, or variable indices. - coefs = np.zeros((len(constraint_like), len(variable_names)), - dtype=float) + coefs = np.zeros((len(constraint_like), len(variable_names)), dtype=float) constants = np.zeros(len(constraint_like)) used = set() for i, (name, value) in enumerate(constraint_like.items()): @@ -379,11 +400,9 @@ def linear_constraint(constraint_like, variable_names): elif isinstance(name, int): idx = name else: - raise ValueError("unrecognized variable name/index %r" - % (name,)) + raise ValueError("unrecognized variable name/index %r" % (name,)) if idx in used: - raise ValueError("duplicated constraint on %r" - % (variable_names[idx],)) + raise ValueError("duplicated constraint on %r" % (variable_names[idx],)) used.add(idx) coefs[i, idx] = 1 constants[i] = value @@ -393,9 +412,11 @@ def linear_constraint(constraint_like, variable_names): constraint_like = [constraint_like] # fall-through - if (isinstance(constraint_like, list) + if ( + isinstance(constraint_like, list) and constraint_like - and isinstance(constraint_like[0], str)): + and isinstance(constraint_like[0], str) + ): constraints = [] for code in constraint_like: if not isinstance(code, str): @@ -435,24 +456,22 @@ def _check_lincon(input, varnames, coefs, constants): def test_linear_constraint(): import pytest from patsy.compat import OrderedDict + t = _check_lincon t(LinearConstraint(["a", "b"], [2, 3]), ["a", "b"], [[2, 3]], [[0]]) - pytest.raises(ValueError, linear_constraint, - LinearConstraint(["b", "a"], [2, 3]), - ["a", "b"]) + pytest.raises( + ValueError, linear_constraint, LinearConstraint(["b", "a"], [2, 3]), ["a", "b"] + ) t({"a": 2}, ["a", "b"], [[1, 0]], [[2]]) - t(OrderedDict([("a", 2), ("b", 3)]), - ["a", "b"], [[1, 0], [0, 1]], [[2], [3]]) - t(OrderedDict([("a", 2), ("b", 3)]), - ["b", "a"], [[0, 1], [1, 0]], [[2], [3]]) + t(OrderedDict([("a", 2), ("b", 3)]), ["a", "b"], [[1, 0], [0, 1]], [[2], [3]]) + t(OrderedDict([("a", 2), ("b", 3)]), ["b", "a"], [[0, 1], [1, 0]], [[2], [3]]) t({0: 2}, ["a", "b"], [[1, 0]], [[2]]) t(OrderedDict([(0, 2), (1, 3)]), ["a", "b"], [[1, 0], [0, 1]], [[2], [3]]) - t(OrderedDict([("a", 2), (1, 3)]), - ["a", "b"], [[1, 0], [0, 1]], [[2], [3]]) + t(OrderedDict([("a", 2), (1, 3)]), ["a", "b"], [[1, 0], [0, 1]], [[2], [3]]) pytest.raises(ValueError, linear_constraint, {"q": 1}, ["a", "b"]) pytest.raises(ValueError, linear_constraint, {"a": 1, 0: 2}, ["a", "b"]) @@ -472,37 +491,46 @@ def test_linear_constraint(): 
pytest.raises(ValueError, linear_constraint, ["a", {"b": 0}], ["a", "b"]) # Actual evaluator tests - t("2 * (a + b/3) + b + 2*3/4 = 1 + 2*3", ["a", "b"], - [[2, 2.0/3 + 1]], [[7 - 6.0/4]]) + t( + "2 * (a + b/3) + b + 2*3/4 = 1 + 2*3", + ["a", "b"], + [[2, 2.0 / 3 + 1]], + [[7 - 6.0 / 4]], + ) t("+2 * -a", ["a", "b"], [[-2, 0]], [[0]]) t("a - b, a + b = 2", ["a", "b"], [[1, -1], [1, 1]], [[0], [2]]) - t("a = 1, a = 2, a = 3", ["a", "b"], - [[1, 0], [1, 0], [1, 0]], [[1], [2], [3]]) + t("a = 1, a = 2, a = 3", ["a", "b"], [[1, 0], [1, 0], [1, 0]], [[1], [2], [3]]) t("a * 2", ["a", "b"], [[2, 0]], [[0]]) t("-a = 1", ["a", "b"], [[-1, 0]], [[1]]) t("(2 + a - a) * b", ["a", "b"], [[0, 2]], [[0]]) t("a = 1 = b", ["a", "b"], [[1, 0], [0, -1]], [[1], [-1]]) t("a = (1 = b)", ["a", "b"], [[0, -1], [1, 0]], [[-1], [1]]) - t("a = 1, a = b = c", ["a", "b", "c"], - [[1, 0, 0], [1, -1, 0], [0, 1, -1]], [[1], [0], [0]]) + t( + "a = 1, a = b = c", + ["a", "b", "c"], + [[1, 0, 0], [1, -1, 0], [0, 1, -1]], + [[1], [0], [0]], + ) # One should never do this of course, but test that it works anyway... t("a + 1 = 2", ["a", "a + 1"], [[0, 1]], [[2]]) t(([10, 20], [30]), ["a", "b"], [[10, 20]], [[30]]) - t(([[10, 20], [20, 40]], [[30], [35]]), ["a", "b"], - [[10, 20], [20, 40]], [[30], [35]]) + t( + ([[10, 20], [20, 40]], [[30], [35]]), + ["a", "b"], + [[10, 20], [20, 40]], + [[30], [35]], + ) # wrong-length tuple - pytest.raises(ValueError, linear_constraint, - ([1, 0], [0], [0]), ["a", "b"]) + pytest.raises(ValueError, linear_constraint, ([1, 0], [0], [0]), ["a", "b"]) pytest.raises(ValueError, linear_constraint, ([1, 0],), ["a", "b"]) t([10, 20], ["a", "b"], [[10, 20]], [[0]]) t([[10, 20], [20, 40]], ["a", "b"], [[10, 20], [20, 40]], [[0], [0]]) t(np.array([10, 20]), ["a", "b"], [[10, 20]], [[0]]) - t(np.array([[10, 20], [20, 40]]), ["a", "b"], - [[10, 20], [20, 40]], [[0], [0]]) + t(np.array([[10, 20], [20, 40]]), ["a", "b"], [[10, 20], [20, 40]], [[0], [0]]) # unknown object type pytest.raises(ValueError, linear_constraint, None, ["a", "b"]) @@ -529,4 +557,5 @@ def test_linear_constraint(): def test_eval_errors(): def doit(bad_code): return linear_constraint(bad_code, ["a", "b", "c"]) + _parsing_error_test(doit, _parse_eval_error_tests) diff --git a/patsy/contrasts.py b/patsy/contrasts.py index c3e6921..0ac9ac7 100644 --- a/patsy/contrasts.py +++ b/patsy/contrasts.py @@ -10,9 +10,14 @@ import numpy as np from patsy import PatsyError -from patsy.util import (repr_pretty_delegate, repr_pretty_impl, - safe_issubdtype, - no_pickling, assert_no_pickling) +from patsy.util import ( + repr_pretty_delegate, + repr_pretty_impl, + safe_issubdtype, + no_pickling, + assert_no_pickling, +) + class ContrastMatrix: """A simple container for a matrix used for coding categorical factors. @@ -33,6 +38,7 @@ class ContrastMatrix: final column names. E.g. for treatment coding the entries will look like ``"[T.level1]"``. 
""" + def __init__(self, matrix, column_suffixes): self.matrix = np.asarray(matrix) self.column_suffixes = column_suffixes @@ -40,11 +46,13 @@ def __init__(self, matrix, column_suffixes): raise PatsyError("matrix and column_suffixes don't conform") __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): repr_pretty_impl(p, self, [self.matrix, self.column_suffixes]) __getstate__ = no_pickling + def test_ContrastMatrix(): cm = ContrastMatrix([[1, 0], [0, 1]], ["a", "b"]) assert np.array_equal(cm.matrix, np.eye(2)) @@ -53,10 +61,12 @@ def test_ContrastMatrix(): repr(cm) import pytest + pytest.raises(PatsyError, ContrastMatrix, [[1], [0]], ["a", "b"]) assert_no_pickling(cm) + # This always produces an object of the type that Python calls 'str' (whether # that be a Python 2 string-of-bytes or a Python 3 string-of-unicode). It does # *not* make any particular guarantees about being reversible or having other @@ -73,32 +83,38 @@ def _obj_to_readable_str(obj): else: return repr(obj) + def test__obj_to_readable_str(): def t(obj, expected): got = _obj_to_readable_str(obj) assert type(got) is str assert got == expected + t(1, "1") t(1.0, "1.0") t("asdf", "asdf") - t(u"asdf", "asdf") + t("asdf", "asdf") # we can use "foo".encode here b/c this is python 3! # a utf-8 encoded euro-sign comes out as a real euro sign. - t("\u20ac".encode("utf-8"), u"\u20ac") + t("\u20ac".encode("utf-8"), "\u20ac") # but a iso-8859-15 euro sign can't be decoded, and we fall back on # repr() t("\u20ac".encode("iso-8859-15"), "b'\\xa4'") + def _name_levels(prefix, levels): return ["[%s%s]" % (prefix, _obj_to_readable_str(level)) for level in levels] + def test__name_levels(): assert _name_levels("a", ["b", "c"]) == ["[ab]", "[ac]"] + def _dummy_code(levels): return ContrastMatrix(np.eye(len(levels)), _name_levels("", levels)) + def _get_level(levels, level_ref): if level_ref in levels: return levels.index(level_ref) @@ -106,11 +122,11 @@ def _get_level(levels, level_ref): if level_ref < 0: level_ref += len(levels) if not (0 <= level_ref < len(levels)): - raise PatsyError("specified level %r is out of range" - % (level_ref,)) + raise PatsyError("specified level %r is out of range" % (level_ref,)) return level_ref raise PatsyError("specified level %r not found" % (level_ref,)) + def test__get_level(): assert _get_level(["a", "b", "c"], 0) == 0 assert _get_level(["a", "b", "c"], -1) == 2 @@ -118,6 +134,7 @@ def test__get_level(): # For integer levels, we check identity before treating it as an index assert _get_level([2, 1, 0], 0) == 2 import pytest + pytest.raises(PatsyError, _get_level, ["a", "b"], 2) pytest.raises(PatsyError, _get_level, ["a", "b"], -3) pytest.raises(PatsyError, _get_level, ["a", "b"], "c") @@ -153,6 +170,7 @@ class Treatment: using ``Treatment(reference=-1)`` will produce contrasts that are "equivalent to those produced by many (but not all) SAS procedures". 
""" + def __init__(self, reference=None): self.reference = reference @@ -165,14 +183,15 @@ def code_without_intercept(self, levels): else: reference = _get_level(levels, self.reference) eye = np.eye(len(levels) - 1) - contrasts = np.vstack((eye[:reference, :], - np.zeros((1, len(levels) - 1)), - eye[reference:, :])) - names = _name_levels("T.", levels[:reference] + levels[reference + 1:]) + contrasts = np.vstack( + (eye[:reference, :], np.zeros((1, len(levels) - 1)), eye[reference:, :]) + ) + names = _name_levels("T.", levels[:reference] + levels[reference + 1 :]) return ContrastMatrix(contrasts, names) __getstate__ = no_pickling + def test_Treatment(): t1 = Treatment() matrix = t1.code_with_intercept(["a", "b", "c"]) @@ -196,6 +215,7 @@ def test_Treatment(): assert matrix.column_suffixes == ["[T.1]", "[T.0]"] assert np.allclose(matrix.matrix, [[0, 0], [1, 0], [0, 1]]) + class Poly(object): """Orthogonal polynomial contrast coding. @@ -230,6 +250,7 @@ class Poly(object): rank encodings are always dummy-coded, regardless of what contrast you have set.) """ + def __init__(self, scores=None): self.scores = scores @@ -240,9 +261,10 @@ def _code_either(self, intercept, levels): scores = np.arange(n) scores = np.asarray(scores, dtype=float) if len(scores) != n: - raise PatsyError("number of levels (%s) does not match" - " number of scores (%s)" - % (n, len(scores))) + raise PatsyError( + "number of levels (%s) does not match" + " number of scores (%s)" % (n, len(scores)) + ) # Strategy: just make a matrix whose columns are naive linear, # quadratic, etc., functions of the raw scores, and then use 'qr' to # orthogonalize each column against those to its left. @@ -250,7 +272,7 @@ def _code_either(self, intercept, levels): raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1)) q, r = np.linalg.qr(raw_poly) q *= np.sign(np.diag(r)) - q /= np.sqrt(np.sum(q ** 2, axis=1)) + q /= np.sqrt(np.sum(q**2, axis=1)) # The constant term is always all 1's -- we don't normalize it. 
q[:, 0] = 1 names = [".Constant", ".Linear", ".Quadratic", ".Cubic"] @@ -271,33 +293,44 @@ def code_without_intercept(self, levels): __getstate__ = no_pickling + def test_Poly(): t1 = Poly() matrix = t1.code_with_intercept(["a", "b", "c"]) assert matrix.column_suffixes == [".Constant", ".Linear", ".Quadratic"] # Values from R 'options(digits=15); contr.poly(3)' - expected = [[1, -7.07106781186548e-01, 0.408248290463863], - [1, 0, -0.816496580927726], - [1, 7.07106781186547e-01, 0.408248290463863]] + expected = [ + [1, -7.07106781186548e-01, 0.408248290463863], + [1, 0, -0.816496580927726], + [1, 7.07106781186547e-01, 0.408248290463863], + ] print(matrix.matrix) assert np.allclose(matrix.matrix, expected) matrix = t1.code_without_intercept(["a", "b", "c"]) assert matrix.column_suffixes == [".Linear", ".Quadratic"] # Values from R 'options(digits=15); contr.poly(3)' print(matrix.matrix) - assert np.allclose(matrix.matrix, - [[-7.07106781186548e-01, 0.408248290463863], - [0, -0.816496580927726], - [7.07106781186547e-01, 0.408248290463863]]) + assert np.allclose( + matrix.matrix, + [ + [-7.07106781186548e-01, 0.408248290463863], + [0, -0.816496580927726], + [7.07106781186547e-01, 0.408248290463863], + ], + ) matrix = Poly(scores=[0, 10, 11]).code_with_intercept(["a", "b", "c"]) assert matrix.column_suffixes == [".Constant", ".Linear", ".Quadratic"] # Values from R 'options(digits=15); contr.poly(3, scores=c(0, 10, 11))' print(matrix.matrix) - assert np.allclose(matrix.matrix, - [[1, -0.813733471206735, 0.0671156055214024], - [1, 0.348742916231458, -0.7382716607354268], - [1, 0.464990554975277, 0.6711560552140243]]) + assert np.allclose( + matrix.matrix, + [ + [1, -0.813733471206735, 0.0671156055214024], + [1, 0.348742916231458, -0.7382716607354268], + [1, 0.464990554975277, 0.6711560552140243], + ], + ) # we had an integer/float handling bug for score vectors whose mean was # non-integer, so check one of those: @@ -305,19 +338,28 @@ def test_Poly(): assert matrix.column_suffixes == [".Constant", ".Linear", ".Quadratic"] # Values from R 'options(digits=15); contr.poly(3, scores=c(0, 10, 12))' print(matrix.matrix) - assert np.allclose(matrix.matrix, - [[1, -0.806559132617443, 0.127000127000191], - [1, 0.293294230042706, -0.762000762001143], - [1, 0.513264902574736, 0.635000635000952]]) + assert np.allclose( + matrix.matrix, + [ + [1, -0.806559132617443, 0.127000127000191], + [1, 0.293294230042706, -0.762000762001143], + [1, 0.513264902574736, 0.635000635000952], + ], + ) import pytest - pytest.raises(PatsyError, - Poly(scores=[0, 1]).code_with_intercept, - ["a", "b", "c"]) + + pytest.raises(PatsyError, Poly(scores=[0, 1]).code_with_intercept, ["a", "b", "c"]) matrix = t1.code_with_intercept(list(range(6))) - assert matrix.column_suffixes == [".Constant", ".Linear", ".Quadratic", - ".Cubic", "^4", "^5"] + assert matrix.column_suffixes == [ + ".Constant", + ".Linear", + ".Quadratic", + ".Cubic", + "^4", + "^5", + ] class Sum(object): @@ -349,6 +391,7 @@ class Sum(object): This is equivalent to R's `contr.sum`. 
""" + def __init__(self, omit=None): self.omit = omit @@ -366,24 +409,24 @@ def _sum_contrast(self, levels): out = np.empty((n, n - 1)) out[:omit_i, :] = eye[:omit_i, :] out[omit_i, :] = -1 - out[omit_i + 1:, :] = eye[omit_i:, :] + out[omit_i + 1 :, :] = eye[omit_i:, :] return out def code_with_intercept(self, levels): contrast = self.code_without_intercept(levels) - matrix = np.column_stack((np.ones(len(levels)), - contrast.matrix)) + matrix = np.column_stack((np.ones(len(levels)), contrast.matrix)) column_suffixes = ["[mean]"] + contrast.column_suffixes return ContrastMatrix(matrix, column_suffixes) def code_without_intercept(self, levels): matrix = self._sum_contrast(levels) omit_i = self._omit_i(levels) - included_levels = levels[:omit_i] + levels[omit_i + 1:] + included_levels = levels[:omit_i] + levels[omit_i + 1 :] return ContrastMatrix(matrix, _name_levels("S.", included_levels)) __getstate__ = no_pickling + def test_Sum(): t1 = Sum() matrix = t1.code_with_intercept(["a", "b", "c"]) @@ -421,6 +464,7 @@ def test_Sum(): assert matrix.column_suffixes == ["[S.b]", "[S.c]"] assert np.allclose(matrix.matrix, [[-1, -1], [1, 0], [0, 1]]) + class Helmert(object): """Helmert contrasts. @@ -444,59 +488,58 @@ class Helmert(object): This is equivalent to R's `contr.helmert`. """ + def _helmert_contrast(self, levels): n = len(levels) - #http://www.ats.ucla.edu/stat/sas/webbooks/reg/chapter5/sasreg5.htm#HELMERT - #contr = np.eye(n - 1) - #int_range = np.arange(n - 1., 1, -1) - #denom = np.repeat(int_range, np.arange(n - 2, 0, -1)) - #contr[np.tril_indices(n - 1, -1)] = -1. / denom - - #http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#HELMERT - #contr = np.zeros((n - 1., n - 1)) - #int_range = np.arange(n, 1, -1) - #denom = np.repeat(int_range[:-1], np.arange(n - 2, 0, -1)) - #contr[np.diag_indices(n - 1)] = (int_range - 1.) / int_range - #contr[np.tril_indices(n - 1, -1)] = -1. / denom - #contr = np.vstack((contr, -1./int_range)) - - #r-like + # http://www.ats.ucla.edu/stat/sas/webbooks/reg/chapter5/sasreg5.htm#HELMERT + # contr = np.eye(n - 1) + # int_range = np.arange(n - 1., 1, -1) + # denom = np.repeat(int_range, np.arange(n - 2, 0, -1)) + # contr[np.tril_indices(n - 1, -1)] = -1. / denom + + # http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#HELMERT + # contr = np.zeros((n - 1., n - 1)) + # int_range = np.arange(n, 1, -1) + # denom = np.repeat(int_range[:-1], np.arange(n - 2, 0, -1)) + # contr[np.diag_indices(n - 1)] = (int_range - 1.) / int_range + # contr[np.tril_indices(n - 1, -1)] = -1. 
/ denom + # contr = np.vstack((contr, -1./int_range)) + + # r-like contr = np.zeros((n, n - 1)) contr[1:][np.diag_indices(n - 1)] = np.arange(1, n) contr[np.triu_indices(n - 1)] = -1 return contr def code_with_intercept(self, levels): - contrast = np.column_stack((np.ones(len(levels)), - self._helmert_contrast(levels))) + contrast = np.column_stack( + (np.ones(len(levels)), self._helmert_contrast(levels)) + ) column_suffixes = _name_levels("H.", ["intercept"] + list(levels[1:])) return ContrastMatrix(contrast, column_suffixes) def code_without_intercept(self, levels): contrast = self._helmert_contrast(levels) - return ContrastMatrix(contrast, - _name_levels("H.", levels[1:])) + return ContrastMatrix(contrast, _name_levels("H.", levels[1:])) __getstate__ = no_pickling + def test_Helmert(): t1 = Helmert() for levels in (["a", "b", "c", "d"], ("a", "b", "c", "d")): matrix = t1.code_with_intercept(levels) - assert matrix.column_suffixes == ["[H.intercept]", - "[H.b]", - "[H.c]", - "[H.d]"] - assert np.allclose(matrix.matrix, [[1, -1, -1, -1], - [1, 1, -1, -1], - [1, 0, 2, -1], - [1, 0, 0, 3]]) + assert matrix.column_suffixes == ["[H.intercept]", "[H.b]", "[H.c]", "[H.d]"] + assert np.allclose( + matrix.matrix, + [[1, -1, -1, -1], [1, 1, -1, -1], [1, 0, 2, -1], [1, 0, 0, 3]], + ) matrix = t1.code_without_intercept(levels) assert matrix.column_suffixes == ["[H.b]", "[H.c]", "[H.d]"] - assert np.allclose(matrix.matrix, [[-1, -1, -1], - [1, -1, -1], - [0, 2, -1], - [0, 0, 3]]) + assert np.allclose( + matrix.matrix, [[-1, -1, -1], [1, -1, -1], [0, 2, -1], [0, 0, 3]] + ) + class Diff(object): """Backward difference coding. @@ -517,27 +560,28 @@ class Diff(object): # Full rank dmatrix("0 + C(a, Diff)", balanced(a=3)) """ + def _diff_contrast(self, levels): nlevels = len(levels) - contr = np.zeros((nlevels, nlevels-1)) + contr = np.zeros((nlevels, nlevels - 1)) int_range = np.arange(1, nlevels) upper_int = np.repeat(int_range, int_range) - row_i, col_i = np.triu_indices(nlevels-1) + row_i, col_i = np.triu_indices(nlevels - 1) # we want to iterate down the columns not across the rows # it would be nice if the index functions had a row/col order arg col_order = np.argsort(col_i) - contr[row_i[col_order], - col_i[col_order]] = (upper_int-nlevels)/float(nlevels) + contr[row_i[col_order], col_i[col_order]] = (upper_int - nlevels) / float( + nlevels + ) lower_int = np.repeat(int_range, int_range[::-1]) - row_i, col_i = np.tril_indices(nlevels-1) + row_i, col_i = np.tril_indices(nlevels - 1) # we want to iterate down the columns not across the rows col_order = np.argsort(col_i) - contr[row_i[col_order]+1, col_i[col_order]] = lower_int/float(nlevels) + contr[row_i[col_order] + 1, col_i[col_order]] = lower_int / float(nlevels) return contr def code_with_intercept(self, levels): - contrast = np.column_stack((np.ones(len(levels)), - self._diff_contrast(levels))) + contrast = np.column_stack((np.ones(len(levels)), self._diff_contrast(levels))) return ContrastMatrix(contrast, _name_levels("D.", levels)) def code_without_intercept(self, levels): @@ -546,21 +590,32 @@ def code_without_intercept(self, levels): __getstate__ = no_pickling + def test_diff(): t1 = Diff() matrix = t1.code_with_intercept(["a", "b", "c", "d"]) - assert matrix.column_suffixes == ["[D.a]", "[D.b]", "[D.c]", - "[D.d]"] - assert np.allclose(matrix.matrix, [[1, -3/4., -1/2., -1/4.], - [1, 1/4., -1/2., -1/4.], - [1, 1/4., 1./2, -1/4.], - [1, 1/4., 1/2., 3/4.]]) + assert matrix.column_suffixes == ["[D.a]", "[D.b]", "[D.c]", "[D.d]"] + assert 
np.allclose( + matrix.matrix, + [ + [1, -3 / 4.0, -1 / 2.0, -1 / 4.0], + [1, 1 / 4.0, -1 / 2.0, -1 / 4.0], + [1, 1 / 4.0, 1.0 / 2, -1 / 4.0], + [1, 1 / 4.0, 1 / 2.0, 3 / 4.0], + ], + ) matrix = t1.code_without_intercept(["a", "b", "c", "d"]) assert matrix.column_suffixes == ["[D.a]", "[D.b]", "[D.c]"] - assert np.allclose(matrix.matrix, [[-3/4., -1/2., -1/4.], - [1/4., -1/2., -1/4.], - [1/4., 2./4, -1/4.], - [1/4., 1/2., 3/4.]]) + assert np.allclose( + matrix.matrix, + [ + [-3 / 4.0, -1 / 2.0, -1 / 4.0], + [1 / 4.0, -1 / 2.0, -1 / 4.0], + [1 / 4.0, 2.0 / 4, -1 / 4.0], + [1 / 4.0, 1 / 2.0, 3 / 4.0], + ], + ) + # contrast can be: # -- a ContrastMatrix @@ -578,10 +633,10 @@ def code_contrast_matrix(intercept, levels, contrast, default=None): return contrast as_array = np.asarray(contrast) if safe_issubdtype(as_array.dtype, np.number): - return ContrastMatrix(as_array, - _name_levels("custom", range(as_array.shape[1]))) + return ContrastMatrix( + as_array, _name_levels("custom", range(as_array.shape[1])) + ) if intercept: return contrast.code_with_intercept(levels) else: return contrast.code_without_intercept(levels) - diff --git a/patsy/desc.py b/patsy/desc.py index 4545de0..6f9d1af 100644 --- a/patsy/desc.py +++ b/patsy/desc.py @@ -16,6 +16,7 @@ # These are made available in the patsy.* namespace __all__ = ["Term", "ModelDesc", "INTERCEPT"] + # One might think it would make more sense for 'factors' to be a set, rather # than a tuple-with-guaranteed-unique-entries-that-compares-like-a-set. The # reason we do it this way is that it preserves the order that the user typed @@ -32,17 +33,19 @@ class Term(object): Terms are hashable and compare by value. Attributes: - + .. attribute:: factors A tuple of factor objects. """ + def __init__(self, factors): self.factors = tuple(uniqueify_list(factors)) def __eq__(self, other): - return (isinstance(other, Term) - and frozenset(other.factors) == frozenset(self.factors)) + return isinstance(other, Term) and frozenset(other.factors) == frozenset( + self.factors + ) def __ne__(self, other): return not self == other @@ -51,6 +54,7 @@ def __hash__(self): return hash((Term, frozenset(self.factors))) __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): assert not cycle repr_pretty_impl(p, self, [list(self.factors)]) @@ -64,8 +68,10 @@ def name(self): __getstate__ = no_pickling + INTERCEPT = Term([]) + class _MockFactor(object): def __init__(self, name): self._name = name @@ -73,6 +79,7 @@ def __init__(self, name): def name(self): return self._name + def test_Term(): assert Term([1, 2, 1]).factors == (1, 2) assert Term([1, 2]) == Term([2, 1]) @@ -85,6 +92,7 @@ def test_Term(): assert_no_pickling(Term([])) + class ModelDesc(object): """A simple container representing the termlists parsed from a formula. @@ -103,17 +111,21 @@ class ModelDesc(object): Two termlists representing the left- and right-hand sides of a formula, suitable for passing to :func:`design_matrix_builders`. 
""" + def __init__(self, lhs_termlist, rhs_termlist): self.lhs_termlist = uniqueify_list(lhs_termlist) self.rhs_termlist = uniqueify_list(rhs_termlist) __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): assert not cycle - return repr_pretty_impl(p, self, - [], - [("lhs_termlist", self.lhs_termlist), - ("rhs_termlist", self.rhs_termlist)]) + return repr_pretty_impl( + p, + self, + [], + [("lhs_termlist", self.lhs_termlist), ("rhs_termlist", self.rhs_termlist)], + ) def describe(self): """Returns a human-readable representation of this :class:`ModelDesc` @@ -125,11 +137,13 @@ def describe(self): was created by parsing a formula, then it should work in practice. If you *really* have to. """ + def term_code(term): if term == INTERCEPT: return "1" else: return term.name() + result = " + ".join([term_code(term) for term in self.lhs_termlist]) if result: result += " ~ " @@ -141,11 +155,12 @@ def term_code(term): term_names = [] if INTERCEPT not in self.rhs_termlist: term_names.append("0") - term_names += [term_code(term) for term in self.rhs_termlist - if term != INTERCEPT] + term_names += [ + term_code(term) for term in self.rhs_termlist if term != INTERCEPT + ] result += " + ".join(term_names) return result - + @classmethod def from_formula(cls, tree_or_string): """Construct a :class:`ModelDesc` from a formula string. @@ -165,6 +180,7 @@ def from_formula(cls, tree_or_string): __getstate__ = no_pickling + def test_ModelDesc(): f1 = _MockFactor("a") f2 = _MockFactor("b") @@ -179,52 +195,63 @@ def test_ModelDesc(): assert ModelDesc([], []).describe() == "~ 0" assert ModelDesc([INTERCEPT], []).describe() == "1 ~ 0" assert ModelDesc([INTERCEPT], [INTERCEPT]).describe() == "1 ~ 1" - assert (ModelDesc([INTERCEPT], [INTERCEPT, Term([f2])]).describe() - == "1 ~ b") + assert ModelDesc([INTERCEPT], [INTERCEPT, Term([f2])]).describe() == "1 ~ b" + def test_ModelDesc_from_formula(): for input in ("y ~ x", parse_formula("y ~ x")): md = ModelDesc.from_formula(input) - assert md.lhs_termlist == [Term([EvalFactor("y")]),] + assert md.lhs_termlist == [ + Term([EvalFactor("y")]), + ] assert md.rhs_termlist == [INTERCEPT, Term([EvalFactor("x")])] + class IntermediateExpr(object): "This class holds an intermediate result while we're evaluating a tree." 
+ def __init__(self, intercept, intercept_origin, intercept_removed, terms): self.intercept = intercept self.intercept_origin = intercept_origin - self.intercept_removed =intercept_removed + self.intercept_removed = intercept_removed self.terms = tuple(uniqueify_list(terms)) if self.intercept: assert self.intercept_origin assert not (self.intercept and self.intercept_removed) __repr__ = repr_pretty_delegate - def _pretty_repr_(self, p, cycle): # pragma: no cover + + def _pretty_repr_(self, p, cycle): # pragma: no cover assert not cycle - return repr_pretty_impl(p, self, - [self.intercept, self.intercept_origin, - self.intercept_removed, self.terms]) + return repr_pretty_impl( + p, + self, + [self.intercept, self.intercept_origin, self.intercept_removed, self.terms], + ) __getstate__ = no_pickling + def _maybe_add_intercept(doit, terms): if doit: return (INTERCEPT,) + terms else: return terms + def _eval_any_tilde(evaluator, tree): - exprs = [evaluator.eval(arg) for arg in tree.args] + exprs = [evaluator.eval(arg) for arg in tree.args] if len(exprs) == 1: # Formula was like: "~ foo" # We pretend that instead it was like: "0 ~ foo" exprs.insert(0, IntermediateExpr(False, None, True, [])) assert len(exprs) == 2 # Note that only the RHS gets an implicit intercept: - return ModelDesc(_maybe_add_intercept(exprs[0].intercept, exprs[0].terms), - _maybe_add_intercept(not exprs[1].intercept_removed, - exprs[1].terms)) + return ModelDesc( + _maybe_add_intercept(exprs[0].intercept, exprs[0].terms), + _maybe_add_intercept(not exprs[1].intercept_removed, exprs[1].terms), + ) + def _eval_binary_plus(evaluator, tree): left_expr = evaluator.eval(tree.args[0]) @@ -233,38 +260,48 @@ def _eval_binary_plus(evaluator, tree): else: right_expr = evaluator.eval(tree.args[1]) if right_expr.intercept: - return IntermediateExpr(True, right_expr.intercept_origin, False, - left_expr.terms + right_expr.terms) + return IntermediateExpr( + True, + right_expr.intercept_origin, + False, + left_expr.terms + right_expr.terms, + ) else: - return IntermediateExpr(left_expr.intercept, - left_expr.intercept_origin, - left_expr.intercept_removed, - left_expr.terms + right_expr.terms) - + return IntermediateExpr( + left_expr.intercept, + left_expr.intercept_origin, + left_expr.intercept_removed, + left_expr.terms + right_expr.terms, + ) + def _eval_binary_minus(evaluator, tree): left_expr = evaluator.eval(tree.args[0]) if tree.args[1].type == "ZERO": - return IntermediateExpr(True, tree.args[1], False, - left_expr.terms) + return IntermediateExpr(True, tree.args[1], False, left_expr.terms) elif tree.args[1].type == "ONE": return IntermediateExpr(False, None, True, left_expr.terms) else: right_expr = evaluator.eval(tree.args[1]) - terms = [term for term in left_expr.terms - if term not in right_expr.terms] + terms = [term for term in left_expr.terms if term not in right_expr.terms] if right_expr.intercept: return IntermediateExpr(False, None, True, terms) else: - return IntermediateExpr(left_expr.intercept, - left_expr.intercept_origin, - left_expr.intercept_removed, - terms) + return IntermediateExpr( + left_expr.intercept, + left_expr.intercept_origin, + left_expr.intercept_removed, + terms, + ) + def _check_interactable(expr): if expr.intercept: - raise PatsyError("intercept term cannot interact with " - "anything else", expr.intercept_origin) + raise PatsyError( + "intercept term cannot interact with " "anything else", + expr.intercept_origin, + ) + def _interaction(left_expr, right_expr): for expr in (left_expr, right_expr): @@ 
-275,12 +312,13 @@ def _interaction(left_expr, right_expr): terms.append(Term(l_term.factors + r_term.factors)) return IntermediateExpr(False, None, False, terms) + def _eval_binary_prod(evaluator, tree): exprs = [evaluator.eval(arg) for arg in tree.args] - return IntermediateExpr(False, None, False, - exprs[0].terms - + exprs[1].terms - + _interaction(*exprs).terms) + return IntermediateExpr( + False, None, False, exprs[0].terms + exprs[1].terms + _interaction(*exprs).terms + ) + # Division (nesting) is right-ward distributive: # a / (b + c) -> a/b + a/c -> a + a:b + a:c @@ -299,16 +337,17 @@ def _eval_binary_div(evaluator, tree): left_factors = [] for term in left_expr.terms: left_factors += list(term.factors) - left_combined_expr = IntermediateExpr(False, None, False, - [Term(left_factors)]) + left_combined_expr = IntermediateExpr(False, None, False, [Term(left_factors)]) # Then interact it with everything on the right: terms += list(_interaction(left_combined_expr, right_expr).terms) return IntermediateExpr(False, None, False, terms) + def _eval_binary_interact(evaluator, tree): exprs = [evaluator.eval(arg) for arg in tree.args] return _interaction(*exprs) + def _eval_binary_power(evaluator, tree): left_expr = evaluator.eval(tree.args[0]) _check_interactable(left_expr) @@ -330,9 +369,11 @@ def _eval_binary_power(evaluator, tree): all_terms = all_terms + big_expr.terms return IntermediateExpr(False, None, False, all_terms) + def _eval_unary_plus(evaluator, tree): return evaluator.eval(tree.args[0]) + def _eval_unary_minus(evaluator, tree): if tree.args[0].type == "ZERO": return IntermediateExpr(True, tree.origin, False, []) @@ -341,20 +382,24 @@ def _eval_unary_minus(evaluator, tree): else: raise PatsyError("Unary minus can only be applied to 1 or 0", tree) + def _eval_zero(evaluator, tree): return IntermediateExpr(False, None, True, []) - + + def _eval_one(evaluator, tree): return IntermediateExpr(True, tree.origin, False, []) + def _eval_number(evaluator, tree): - raise PatsyError("numbers besides '0' and '1' are " - "only allowed with **", tree) + raise PatsyError("numbers besides '0' and '1' are " "only allowed with **", tree) + def _eval_python_expr(evaluator, tree): factor = EvalFactor(tree.token.extra, origin=tree.origin) return IntermediateExpr(False, None, False, [Term([factor])]) + class Evaluator(object): def __init__(self): self._evaluators = {} @@ -391,21 +436,26 @@ def eval(self, tree, require_evalexpr=True): assert isinstance(tree, ParseNode) key = (tree.type, len(tree.args)) if key not in self._evaluators: - raise PatsyError("I don't know how to evaluate this " - "'%s' operator" % (tree.type,), - tree.token) + raise PatsyError( + "I don't know how to evaluate this " "'%s' operator" % (tree.type,), + tree.token, + ) result = self._evaluators[key](self, tree) if require_evalexpr and not isinstance(result, IntermediateExpr): if isinstance(result, ModelDesc): - raise PatsyError("~ can only be used once, and " - "only at the top level", - tree) + raise PatsyError( + "~ can only be used once, and " "only at the top level", tree + ) else: - raise PatsyError("custom operator returned an " - "object that I don't know how to " - "handle", tree) + raise PatsyError( + "custom operator returned an " + "object that I don't know how to " + "handle", + tree, + ) return result + ############# _eval_tests = { @@ -413,7 +463,6 @@ def eval(self, tree, require_evalexpr=True): " ": (True, []), " \n ": (True, []), "a": (True, ["a"]), - "1": (True, []), "0": (False, []), "- 1": (False, []), @@ 
-424,30 +473,23 @@ def eval(self, tree, require_evalexpr=True): "1 + 0": (False, []), "1 - 0": (True, []), "0 - 1": (False, []), - "1 + a": (True, ["a"]), "0 + a": (False, ["a"]), "a - 1": (False, ["a"]), "a - 0": (True, ["a"]), "1 - a": (True, []), - "a + b": (True, ["a", "b"]), "(a + b)": (True, ["a", "b"]), "a + ((((b))))": (True, ["a", "b"]), "a + ((((+b))))": (True, ["a", "b"]), "a + ((((b - a))))": (True, ["a", "b"]), - "a + a + a": (True, ["a"]), - "a + (b - a)": (True, ["a", "b"]), - "a + np.log(a, base=10)": (True, ["a", "np.log(a, base=10)"]), # Note different spacing: "a + np.log(a, base=10) - np . log(a , base = 10)": (True, ["a"]), - "a + (I(b) + c)": (True, ["a", "I(b)", "c"]), "a + I(b + c)": (True, ["a", "I(b + c)"]), - "a:b": (True, [("a", "b")]), "a:b:a": (True, [("a", "b")]), "a:(b + c)": (True, [("a", "b"), ("a", "c")]), @@ -456,13 +498,10 @@ def eval(self, tree, require_evalexpr=True): "c + a:c + a:(b - c)": (True, ["c", ("a", "c"), ("a", "b")]), "(a - b):c": (True, [("a", "c")]), "b + b:c + (a - b):c": (True, ["b", ("b", "c"), ("a", "c")]), - "a:b - a:b": (True, []), "a:b - b:a": (True, []), - "1 - (a + b)": (True, []), "a + b - (a + b)": (True, []), - "a * b": (True, ["a", "b", ("a", "b")]), "a * b * a": (True, ["a", "b", ("a", "b")]), "a * (b + c)": (True, ["a", "b", "c", ("a", "b"), ("a", "c")]), @@ -471,29 +510,50 @@ def eval(self, tree, require_evalexpr=True): "c + a:c + a * (b - c)": (True, ["c", ("a", "c"), "a", "b", ("a", "b")]), "(a - b) * c": (True, ["a", "c", ("a", "c")]), "b + b:c + (a - b) * c": (True, ["b", ("b", "c"), "a", "c", ("a", "c")]), - "a/b": (True, ["a", ("a", "b")]), "(a + b)/c": (True, ["a", "b", ("a", "b", "c")]), "b + b:c + (a - b)/c": (True, ["b", ("b", "c"), "a", ("a", "c")]), "a/(b + c)": (True, ["a", ("a", "b"), ("a", "c")]), - "a ** 2": (True, ["a"]), - "(a + b + c + d) ** 2": (True, ["a", "b", "c", "d", - ("a", "b"), ("a", "c"), ("a", "d"), - ("b", "c"), ("b", "d"), ("c", "d")]), - "(a + b + c + d) ** 3": (True, ["a", "b", "c", "d", - ("a", "b"), ("a", "c"), ("a", "d"), - ("b", "c"), ("b", "d"), ("c", "d"), - ("a", "b", "c"), ("a", "b", "d"), - ("a", "c", "d"), ("b", "c", "d")]), - + "(a + b + c + d) ** 2": ( + True, + [ + "a", + "b", + "c", + "d", + ("a", "b"), + ("a", "c"), + ("a", "d"), + ("b", "c"), + ("b", "d"), + ("c", "d"), + ], + ), + "(a + b + c + d) ** 3": ( + True, + [ + "a", + "b", + "c", + "d", + ("a", "b"), + ("a", "c"), + ("a", "d"), + ("b", "c"), + ("b", "d"), + ("c", "d"), + ("a", "b", "c"), + ("a", "b", "d"), + ("a", "c", "d"), + ("b", "c", "d"), + ], + ), "a + +a": (True, ["a"]), - "~ a + b": (True, ["a", "b"]), "~ a*b": (True, ["a", "b", ("a", "b")]), "~ a*b + 0": (False, ["a", "b", ("a", "b")]), "~ -1": (False, []), - "0 ~ a + b": (True, ["a", "b"]), "1 ~ a + b": (True, [], True, ["a", "b"]), "y ~ a + b": (False, ["y"], True, ["a", "b"]), @@ -501,7 +561,6 @@ def eval(self, tree, require_evalexpr=True): "0 + y * z ~ a + b": (False, ["y", "z", ("y", "z")], True, ["a", "b"]), "-1 ~ 1": (False, [], True, []), "1 + y ~ a + b": (True, ["y"], True, ["a", "b"]), - # Check precedence: "a + b * c": (True, ["a", "b", "c", ("b", "c")]), "a * b + c": (True, ["a", "b", ("a", "b"), "c"]), @@ -510,15 +569,14 @@ def eval(self, tree, require_evalexpr=True): "a / b + c": (True, ["a", ("a", "b"), "c"]), "a*b:c": (True, ["a", ("b", "c"), ("a", "b", "c")]), "a:b*c": (True, [("a", "b"), "c", ("a", "b", "c")]), - # Intercept handling: "~ 1 + 1 + 0 + 1": (True, []), "~ 0 + 1 + 0": (False, []), "~ 0 - 1 - 1 + 0 + 1": (True, []), "~ 1 - 1": 
(False, []), "~ 0 + a + 1": (True, ["a"]), - "~ 1 + (a + 0)": (True, ["a"]), # This is correct, but perhaps surprising! - "~ 0 + (a + 1)": (True, ["a"]), # Also correct! + "~ 1 + (a + 0)": (True, ["a"]), # This is correct, but perhaps surprising! + "~ 0 + (a + 1)": (True, ["a"]), # Also correct! "~ 1 - (a + 1)": (False, []), } @@ -526,60 +584,46 @@ def eval(self, tree, require_evalexpr=True): _eval_error_tests = [ "a <+>", "a + <(>", - "b + <(-a)>", - "a:<1>", "(a + <1>)*b", - "a + <2>", "a + <1.0>", # eh, catching this is a hassle, we'll just leave the user some rope if # they really want it: - #"a + <0x1>", - + # "a + <0x1>", "a ** ", "a ** <(1 + 1)>", "a ** <1.5>", - "a + b <# asdf>", - "<)>", "a + <)>", "<*> a", "a + <*>", - "a + ", "a + ", "a + ", - "a + <[bar>", "a + <{bar>", - "a + <{bar[]>", - "a + foo<]>bar", "a + foo[]<]>bar", "a + foo{}<}>bar", "a + foo<)>bar", - "a + b<)>", "(a) <.>", - "<(>a + b", - " ~ b", "y ~ <(a ~ b)>", "<~ a> ~ b", "~ <(a ~ b)>", - "1 + <-(a + b)>", - "<- a>", "a + <-a**2>", ] -def _assert_terms_match(terms, expected_intercept, expecteds): # pragma: no cover + +def _assert_terms_match(terms, expected_intercept, expecteds): # pragma: no cover if expected_intercept: expecteds = [()] + expecteds assert len(terms) == len(expecteds) @@ -591,7 +635,8 @@ def _assert_terms_match(terms, expected_intercept, expecteds): # pragma: no cove else: assert term == expected -def _do_eval_formula_tests(tests): # pragma: no cover + +def _do_eval_formula_tests(tests): # pragma: no cover for code, result in tests.items(): if len(result) == 2: result = (False, []) + result @@ -600,24 +645,24 @@ def _do_eval_formula_tests(tests): # pragma: no cover print(result) print(model_desc) lhs_intercept, lhs_termlist, rhs_intercept, rhs_termlist = result - _assert_terms_match(model_desc.lhs_termlist, - lhs_intercept, lhs_termlist) - _assert_terms_match(model_desc.rhs_termlist, - rhs_intercept, rhs_termlist) + _assert_terms_match(model_desc.lhs_termlist, lhs_intercept, lhs_termlist) + _assert_terms_match(model_desc.rhs_termlist, rhs_intercept, rhs_termlist) + def test_eval_formula(): _do_eval_formula_tests(_eval_tests) + def test_eval_formula_error_reporting(): from patsy.parse_formula import _parsing_error_test + parse_fn = lambda formula: ModelDesc.from_formula(formula) _parsing_error_test(parse_fn, _eval_error_tests) + def test_formula_factor_origin(): from patsy.origin import Origin + desc = ModelDesc.from_formula("a + b") - assert (desc.rhs_termlist[1].factors[0].origin - == Origin("a + b", 0, 1)) - assert (desc.rhs_termlist[2].factors[0].origin - == Origin("a + b", 4, 5)) - + assert desc.rhs_termlist[1].factors[0].origin == Origin("a + b", 0, 1) + assert desc.rhs_termlist[2].factors[0].origin == Origin("a + b", 4, 5) diff --git a/patsy/design_info.py b/patsy/design_info.py index e27d382..12a9510 100644 --- a/patsy/design_info.py +++ b/patsy/design_info.py @@ -27,13 +27,18 @@ from patsy import PatsyError from patsy.util import atleast_2d_column_default from patsy.compat import OrderedDict -from patsy.util import (repr_pretty_delegate, repr_pretty_impl, - safe_issubdtype, - no_pickling, assert_no_pickling) +from patsy.util import ( + repr_pretty_delegate, + repr_pretty_impl, + safe_issubdtype, + no_pickling, + assert_no_pickling, +) from patsy.constraint import linear_constraint from patsy.contrasts import ContrastMatrix from patsy.desc import ModelDesc, Term + class FactorInfo: """A FactorInfo object is a simple class that provides some metadata about the role of a factor within a 
model. :attr:`DesignInfo.factor_infos` is @@ -71,46 +76,49 @@ class FactorInfo: ``None``. """ - def __init__(self, factor, type, state, - num_columns=None, categories=None): + def __init__(self, factor, type, state, num_columns=None, categories=None): self.factor = factor self.type = type if self.type not in ["numerical", "categorical"]: - raise ValueError("FactorInfo.type must be " - "'numerical' or 'categorical', not %r" - % (self.type,)) + raise ValueError( + "FactorInfo.type must be " + "'numerical' or 'categorical', not %r" % (self.type,) + ) self.state = state if self.type == "numerical": if not isinstance(num_columns, int): - raise ValueError("For numerical factors, num_columns " - "must be an integer") + raise ValueError( + "For numerical factors, num_columns " "must be an integer" + ) if categories is not None: - raise ValueError("For numerical factors, categories " - "must be None") + raise ValueError("For numerical factors, categories " "must be None") else: assert self.type == "categorical" if num_columns is not None: - raise ValueError("For categorical factors, num_columns " - "must be None") + raise ValueError("For categorical factors, num_columns " "must be None") categories = tuple(categories) self.num_columns = num_columns self.categories = categories __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): assert not cycle + class FactorState(object): def __repr__(self): return "" - kwlist = [("factor", self.factor), - ("type", self.type), - # Don't put the state in people's faces, it will - # just encourage them to pay attention to the - # contents :-). Plus it's a bunch of gobbledygook - # they don't care about. They can always look at - # self.state if they want to know... - ("state", FactorState()), - ] + + kwlist = [ + ("factor", self.factor), + ("type", self.type), + # Don't put the state in people's faces, it will + # just encourage them to pay attention to the + # contents :-). Plus it's a bunch of gobbledygook + # they don't care about. They can always look at + # self.state if they want to know... 
+ ("state", FactorState()), + ] if self.type == "numerical": kwlist.append(("num_columns", self.num_columns)) else: @@ -119,6 +127,7 @@ def __repr__(self): __getstate__ = no_pickling + def test_FactorInfo(): fi1 = FactorInfo("asdf", "numerical", {"a": 1}, num_columns=10) assert fi1.factor == "asdf" @@ -141,19 +150,18 @@ def test_FactorInfo(): repr(fi2) import pytest + pytest.raises(ValueError, FactorInfo, "asdf", "non-numerical", {}) pytest.raises(ValueError, FactorInfo, "asdf", "numerical", {}) - pytest.raises(ValueError, FactorInfo, "asdf", "numerical", {}, - num_columns="asdf") - pytest.raises(ValueError, FactorInfo, "asdf", "numerical", {}, - num_columns=1, categories=1) + pytest.raises(ValueError, FactorInfo, "asdf", "numerical", {}, num_columns="asdf") + pytest.raises( + ValueError, FactorInfo, "asdf", "numerical", {}, num_columns=1, categories=1 + ) pytest.raises(TypeError, FactorInfo, "asdf", "categorical", {}) - pytest.raises(ValueError, FactorInfo, "asdf", "categorical", {}, - num_columns=1) - pytest.raises(TypeError, FactorInfo, "asdf", "categorical", {}, - categories=1) + pytest.raises(ValueError, FactorInfo, "asdf", "categorical", {}, num_columns=1) + pytest.raises(TypeError, FactorInfo, "asdf", "categorical", {}, categories=1) class SubtermInfo: @@ -210,23 +218,32 @@ def __init__(self, factors, contrast_matrices, num_columns): if factor not in factor_set: raise ValueError("Unexpected factor in contrast_matrices dict") if not isinstance(contrast_matrix, ContrastMatrix): - raise ValueError("Expected a ContrastMatrix, not %r" - % (contrast_matrix,)) + raise ValueError( + "Expected a ContrastMatrix, not %r" % (contrast_matrix,) + ) self.contrast_matrices = contrast_matrices if not isinstance(num_columns, int): raise ValueError("num_columns must be an integer") self.num_columns = num_columns __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): assert not cycle - repr_pretty_impl(p, self, [], - [("factors", self.factors), - ("contrast_matrices", self.contrast_matrices), - ("num_columns", self.num_columns)]) + repr_pretty_impl( + p, + self, + [], + [ + ("factors", self.factors), + ("contrast_matrices", self.contrast_matrices), + ("num_columns", self.num_columns), + ], + ) __getstate__ = no_pickling + def test_SubtermInfo(): cm = ContrastMatrix(np.ones((2, 2)), ["[1]", "[2]"]) s = SubtermInfo(["a", "x"], {"a": cm}, 4) @@ -238,12 +255,14 @@ def test_SubtermInfo(): repr(s) import pytest + pytest.raises(TypeError, SubtermInfo, 1, {}, 1) pytest.raises(ValueError, SubtermInfo, ["a", "x"], 1, 1) pytest.raises(ValueError, SubtermInfo, ["a", "x"], {"z": cm}, 1) pytest.raises(ValueError, SubtermInfo, ["a", "x"], {"a": 1}, 1) pytest.raises(ValueError, SubtermInfo, ["a", "x"], {}, 1.5) + class DesignInfo(object): """A DesignInfo object holds metadata about a design matrix. 
@@ -254,14 +273,16 @@ class DesignInfo(object): """ - def __init__(self, column_names, - factor_infos=None, term_codings=None): - self.column_name_indexes = OrderedDict(zip(column_names, - range(len(column_names)))) + def __init__(self, column_names, factor_infos=None, term_codings=None): + self.column_name_indexes = OrderedDict( + zip(column_names, range(len(column_names))) + ) if (factor_infos is None) != (term_codings is None): - raise ValueError("Must specify either both or neither of " - "factor_infos= and term_codings=") + raise ValueError( + "Must specify either both or neither of " + "factor_infos= and term_codings=" + ) self.factor_infos = factor_infos self.term_codings = term_codings @@ -283,8 +304,7 @@ def __init__(self, column_names, term_factors = set(term.factors) for subterm in subterms: if not isinstance(subterm, SubtermInfo): - raise ValueError("expected SubtermInfo, " - "not %r" % (subterm,)) + raise ValueError("expected SubtermInfo, " "not %r" % (subterm,)) if not term_factors.issuperset(subterm.factors): raise ValueError("unexpected factors in subterm") @@ -292,12 +312,14 @@ def __init__(self, column_names, for term in self.term_codings: all_factors.update(term.factors) if all_factors != set(self.factor_infos): - raise ValueError("Provided Term objects and factor_infos " - "do not match") + raise ValueError( + "Provided Term objects and factor_infos " "do not match" + ) for factor, factor_info in self.factor_infos.items(): if not isinstance(factor_info, FactorInfo): - raise ValueError("expected FactorInfo object, not %r" - % (factor_info,)) + raise ValueError( + "expected FactorInfo object, not %r" % (factor_info,) + ) if factor != factor_info.factor: raise ValueError("mismatched factor_info.factor") @@ -313,13 +335,17 @@ def __init__(self, column_names, assert fi.type == "categorical" cm = subterm.contrast_matrices[factor].matrix if cm.shape[0] != len(fi.categories): - raise ValueError("Mismatched contrast matrix " - "for factor %r" % (factor,)) + raise ValueError( + "Mismatched contrast matrix " + "for factor %r" % (factor,) + ) cat_factors.add(factor) exp_cols *= cm.shape[1] if cat_factors != set(subterm.contrast_matrices): - raise ValueError("Mismatch between contrast_matrices " - "and categorical factors") + raise ValueError( + "Mismatch between contrast_matrices " + "and categorical factors" + ) if exp_cols != subterm.num_columns: raise ValueError("Unexpected num_columns") @@ -341,11 +367,12 @@ def __init__(self, column_names, self.term_slices[term] = slice(idx, idx + term_columns) idx += term_columns if idx != len(self.column_names): - raise ValueError("mismatch between column_names and columns " - "coded by given terms") + raise ValueError( + "mismatch between column_names and columns " "coded by given terms" + ) self.term_name_slices = OrderedDict( - [(term.name(), slice_) - for (term, slice_) in self.term_slices.items()]) + [(term.name(), slice_) for (term, slice_) in self.term_slices.items()] + ) # Guarantees: # term_name_slices is never None @@ -356,8 +383,9 @@ def __init__(self, column_names, # term_name_slices. assert self.term_name_slices is not None if self.term_slices is not None: - assert (list(self.term_slices.values()) - == list(self.term_name_slices.values())) + assert list(self.term_slices.values()) == list( + self.term_name_slices.values() + ) # These checks probably aren't necessary anymore now that we always # generate the slices ourselves, but we'll leave them in just to be # safe. 
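As an aside, a minimal sketch of the factor_infos/term_codings contract that the constructor above enforces. It mirrors the _MockFactor pattern used in the tests further down; the factor name "x" and the column names are made up for illustration, and a real factor would be an EvalFactor rather than a mock:

    from collections import OrderedDict
    from patsy.desc import Term
    from patsy.design_info import DesignInfo, FactorInfo, SubtermInfo

    class MockFactor(object):
        def __init__(self, name):
            self._name = name

        def name(self):
            return self._name

    f_x = MockFactor("x")
    # One numerical factor producing two columns, coded by a single term:
    factor_infos = {f_x: FactorInfo(f_x, "numerical", {}, num_columns=2)}
    term_codings = OrderedDict([(Term([f_x]), [SubtermInfo([f_x], {}, 2)])])
    di = DesignInfo(["x[0]", "x[1]"], factor_infos, term_codings)
    print(di.term_name_slices)  # OrderedDict([('x', slice(0, 2))])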
@@ -377,12 +405,15 @@ def __init__(self, column_names, raise ValueError("term/column name collision") __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): assert not cycle - repr_pretty_impl(p, self, - [self.column_names], - [("factor_infos", self.factor_infos), - ("term_codings", self.term_codings)]) + repr_pretty_impl( + p, + self, + [self.column_names], + [("factor_infos", self.factor_infos), ("term_codings", self.term_codings)], + ) @property def column_names(self): @@ -404,22 +435,30 @@ def term_names(self): @property def builder(self): ".. deprecated:: 0.4.0" - warnings.warn(DeprecationWarning( - "The DesignInfo.builder attribute is deprecated starting in " - "patsy v0.4.0; distinct builder objects have been eliminated " - "and design_info.builder is now just a long-winded way of " - "writing 'design_info' (i.e. the .builder attribute just " - "returns self)"), stacklevel=2) + warnings.warn( + DeprecationWarning( + "The DesignInfo.builder attribute is deprecated starting in " + "patsy v0.4.0; distinct builder objects have been eliminated " + "and design_info.builder is now just a long-winded way of " + "writing 'design_info' (i.e. the .builder attribute just " + "returns self)" + ), + stacklevel=2, + ) return self @property def design_info(self): ".. deprecated:: 0.4.0" - warnings.warn(DeprecationWarning( - "Starting in patsy v0.4.0, the DesignMatrixBuilder class has " - "been merged into the DesignInfo class. So there's no need to " - "use builder.design_info to access the DesignInfo; 'builder' " - "already *is* a DesignInfo."), stacklevel=2) + warnings.warn( + DeprecationWarning( + "Starting in patsy v0.4.0, the DesignMatrixBuilder class has " + "been merged into the DesignInfo class. So there's no need to " + "use builder.design_info to access the DesignInfo; 'builder' " + "already *is* a DesignInfo." + ), + stacklevel=2, + ) return self def slice(self, columns_specifier): @@ -459,16 +498,14 @@ def slice(self, columns_specifier): return columns_specifier if np.issubdtype(type(columns_specifier), np.integer): return slice(columns_specifier, columns_specifier + 1) - if (self.term_slices is not None - and columns_specifier in self.term_slices): + if self.term_slices is not None and columns_specifier in self.term_slices: return self.term_slices[columns_specifier] if columns_specifier in self.term_name_slices: return self.term_name_slices[columns_specifier] if columns_specifier in self.column_name_indexes: idx = self.column_name_indexes[columns_specifier] return slice(idx, idx + 1) - raise PatsyError("unknown column specified '%s'" - % (columns_specifier,)) + raise PatsyError("unknown column specified '%s'" % (columns_specifier,)) def linear_constraint(self, constraint_likes): """Construct a linear constraint in matrix form from a (possibly @@ -641,9 +678,11 @@ def subset(self, which_terms): for f in term.factors: new_factor_infos[f] = self.factor_infos[f] new_term_codings[term] = self.term_codings[term] - return DesignInfo(new_column_names, - factor_infos=new_factor_infos, - term_codings=new_term_codings) + return DesignInfo( + new_column_names, + factor_infos=new_factor_infos, + term_codings=new_term_codings, + ) @classmethod def from_array(cls, array_like, default_column_prefix="column"): @@ -663,41 +702,44 @@ def from_array(cls, array_like, default_column_prefix="column"): then this will be used to construct them. 
:returns: a DesignInfo object """ - if hasattr(array_like, "design_info") and isinstance(array_like.design_info, cls): + if hasattr(array_like, "design_info") and isinstance( + array_like.design_info, cls + ): return array_like.design_info arr = atleast_2d_column_default(array_like, preserve_pandas=True) if arr.ndim > 2: raise ValueError("design matrix can't have >2 dimensions") columns = getattr(arr, "columns", range(arr.shape[1])) - if (hasattr(columns, "dtype") - and not safe_issubdtype(columns.dtype, np.integer)): + if hasattr(columns, "dtype") and not safe_issubdtype(columns.dtype, np.integer): column_names = [str(obj) for obj in columns] else: - column_names = ["%s%s" % (default_column_prefix, i) - for i in columns] + column_names = ["%s%s" % (default_column_prefix, i) for i in columns] return DesignInfo(column_names) __getstate__ = no_pickling + def test_DesignInfo(): import pytest + class _MockFactor(object): def __init__(self, name): self._name = name def name(self): return self._name + f_x = _MockFactor("x") f_y = _MockFactor("y") t_x = Term([f_x]) t_y = Term([f_y]) - factor_infos = {f_x: - FactorInfo(f_x, "numerical", {}, num_columns=3), - f_y: - FactorInfo(f_y, "numerical", {}, num_columns=1), - } - term_codings = OrderedDict([(t_x, [SubtermInfo([f_x], {}, 3)]), - (t_y, [SubtermInfo([f_y], {}, 1)])]) + factor_infos = { + f_x: FactorInfo(f_x, "numerical", {}, num_columns=3), + f_y: FactorInfo(f_y, "numerical", {}, num_columns=1), + } + term_codings = OrderedDict( + [(t_x, [SubtermInfo([f_x], {}, 3)]), (t_y, [SubtermInfo([f_y], {}, 1)])] + ) di = DesignInfo(["x1", "x2", "x3", "y"], factor_infos, term_codings) assert di.column_names == ["x1", "x2", "x3", "y"] assert di.term_names == ["x", "y"] @@ -729,10 +771,12 @@ def name(self): assert di.term_names == ["a1", "a2", "a3", "b"] assert di.terms is None assert di.column_name_indexes == {"a1": 0, "a2": 1, "a3": 2, "b": 3} - assert di.term_name_slices == {"a1": slice(0, 1), - "a2": slice(1, 2), - "a3": slice(2, 3), - "b": slice(3, 4)} + assert di.term_name_slices == { + "a1": slice(0, 1), + "a2": slice(1, 2), + "a3": slice(2, 3), + "b": slice(3, 4), + } assert di.term_slices is None assert di.describe() == "a1 + a2 + a3 + b" @@ -747,137 +791,211 @@ def name(self): # Failure modes # must specify either both or neither of factor_infos and term_codings: - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], factor_infos=factor_infos) - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], term_codings=term_codings) + pytest.raises( + ValueError, DesignInfo, ["x1", "x2", "x3", "y"], factor_infos=factor_infos + ) + pytest.raises( + ValueError, DesignInfo, ["x1", "x2", "x3", "y"], term_codings=term_codings + ) # factor_infos must be a dict - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], list(factor_infos), term_codings) + pytest.raises( + ValueError, + DesignInfo, + ["x1", "x2", "x3", "y"], + list(factor_infos), + term_codings, + ) # wrong number of column names: - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y1", "y2"], factor_infos, term_codings) - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3"], factor_infos, term_codings) + pytest.raises( + ValueError, + DesignInfo, + ["x1", "x2", "x3", "y1", "y2"], + factor_infos, + term_codings, + ) + pytest.raises( + ValueError, DesignInfo, ["x1", "x2", "x3"], factor_infos, term_codings + ) # name overlap problems - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "y", "y2"], factor_infos, term_codings) + pytest.raises( + 
ValueError, DesignInfo, ["x1", "x2", "y", "y2"], factor_infos, term_codings + ) # duplicate name - pytest.raises(ValueError, DesignInfo, - ["x1", "x1", "x1", "y"], factor_infos, term_codings) + pytest.raises( + ValueError, DesignInfo, ["x1", "x1", "x1", "y"], factor_infos, term_codings + ) # f_y is in factor_infos, but not mentioned in any term term_codings_x_only = OrderedDict(term_codings) del term_codings_x_only[t_y] - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3"], factor_infos, term_codings_x_only) + pytest.raises( + ValueError, DesignInfo, ["x1", "x2", "x3"], factor_infos, term_codings_x_only + ) # f_a is in a term, but not in factor_infos f_a = _MockFactor("a") t_a = Term([f_a]) term_codings_with_a = OrderedDict(term_codings) term_codings_with_a[t_a] = [SubtermInfo([f_a], {}, 1)] - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y", "a"], - factor_infos, term_codings_with_a) + pytest.raises( + ValueError, + DesignInfo, + ["x1", "x2", "x3", "y", "a"], + factor_infos, + term_codings_with_a, + ) # bad factor_infos not_factor_infos = dict(factor_infos) not_factor_infos[f_x] = "what is this I don't even" - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], not_factor_infos, term_codings) + pytest.raises( + ValueError, DesignInfo, ["x1", "x2", "x3", "y"], not_factor_infos, term_codings + ) mismatch_factor_infos = dict(factor_infos) mismatch_factor_infos[f_x] = FactorInfo(f_a, "numerical", {}, num_columns=3) - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], mismatch_factor_infos, term_codings) + pytest.raises( + ValueError, + DesignInfo, + ["x1", "x2", "x3", "y"], + mismatch_factor_infos, + term_codings, + ) # bad term_codings - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], factor_infos, dict(term_codings)) + pytest.raises( + ValueError, + DesignInfo, + ["x1", "x2", "x3", "y"], + factor_infos, + dict(term_codings), + ) not_term_codings = OrderedDict(term_codings) not_term_codings["this is a string"] = term_codings[t_x] - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], factor_infos, not_term_codings) + pytest.raises( + ValueError, DesignInfo, ["x1", "x2", "x3", "y"], factor_infos, not_term_codings + ) non_list_term_codings = OrderedDict(term_codings) non_list_term_codings[t_y] = tuple(term_codings[t_y]) - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], factor_infos, non_list_term_codings) + pytest.raises( + ValueError, + DesignInfo, + ["x1", "x2", "x3", "y"], + factor_infos, + non_list_term_codings, + ) non_subterm_term_codings = OrderedDict(term_codings) non_subterm_term_codings[t_y][0] = "not a SubtermInfo" - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], factor_infos, non_subterm_term_codings) + pytest.raises( + ValueError, + DesignInfo, + ["x1", "x2", "x3", "y"], + factor_infos, + non_subterm_term_codings, + ) bad_subterm = OrderedDict(term_codings) # f_x is a factor in this model, but it is not a factor in t_y term_codings[t_y][0] = SubtermInfo([f_x], {}, 1) - pytest.raises(ValueError, DesignInfo, - ["x1", "x2", "x3", "y"], factor_infos, bad_subterm) + pytest.raises( + ValueError, DesignInfo, ["x1", "x2", "x3", "y"], factor_infos, bad_subterm + ) # contrast matrix has wrong number of rows - factor_codings_a = {f_a: - FactorInfo(f_a, "categorical", {}, - categories=["a1", "a2"])} - term_codings_a_bad_rows = OrderedDict([ - (t_a, - [SubtermInfo([f_a], - {f_a: ContrastMatrix(np.ones((3, 2)), - ["[1]", "[2]"])}, - 2)])]) - pytest.raises(ValueError, DesignInfo, - 
["a[1]", "a[2]"], - factor_codings_a, - term_codings_a_bad_rows) + factor_codings_a = { + f_a: FactorInfo(f_a, "categorical", {}, categories=["a1", "a2"]) + } + term_codings_a_bad_rows = OrderedDict( + [ + ( + t_a, + [ + SubtermInfo( + [f_a], {f_a: ContrastMatrix(np.ones((3, 2)), ["[1]", "[2]"])}, 2 + ) + ], + ) + ] + ) + pytest.raises( + ValueError, + DesignInfo, + ["a[1]", "a[2]"], + factor_codings_a, + term_codings_a_bad_rows, + ) # have a contrast matrix for a non-categorical factor t_ax = Term([f_a, f_x]) - factor_codings_ax = {f_a: - FactorInfo(f_a, "categorical", {}, - categories=["a1", "a2"]), - f_x: - FactorInfo(f_x, "numerical", {}, - num_columns=2)} - term_codings_ax_extra_cm = OrderedDict([ - (t_ax, - [SubtermInfo([f_a, f_x], - {f_a: ContrastMatrix(np.ones((2, 2)), ["[1]", "[2]"]), - f_x: ContrastMatrix(np.ones((2, 2)), ["[1]", "[2]"])}, - 4)])]) - pytest.raises(ValueError, DesignInfo, - ["a[1]:x[1]", "a[2]:x[1]", "a[1]:x[2]", "a[2]:x[2]"], - factor_codings_ax, - term_codings_ax_extra_cm) + factor_codings_ax = { + f_a: FactorInfo(f_a, "categorical", {}, categories=["a1", "a2"]), + f_x: FactorInfo(f_x, "numerical", {}, num_columns=2), + } + term_codings_ax_extra_cm = OrderedDict( + [ + ( + t_ax, + [ + SubtermInfo( + [f_a, f_x], + { + f_a: ContrastMatrix(np.ones((2, 2)), ["[1]", "[2]"]), + f_x: ContrastMatrix(np.ones((2, 2)), ["[1]", "[2]"]), + }, + 4, + ) + ], + ) + ] + ) + pytest.raises( + ValueError, + DesignInfo, + ["a[1]:x[1]", "a[2]:x[1]", "a[1]:x[2]", "a[2]:x[2]"], + factor_codings_ax, + term_codings_ax_extra_cm, + ) # no contrast matrix for a categorical factor - term_codings_ax_missing_cm = OrderedDict([ - (t_ax, - [SubtermInfo([f_a, f_x], - {}, - 4)])]) + term_codings_ax_missing_cm = OrderedDict([(t_ax, [SubtermInfo([f_a, f_x], {}, 4)])]) # This actually fails before it hits the relevant check with a KeyError, # but that's okay... the previous test still exercises the check. 
- pytest.raises((ValueError, KeyError), DesignInfo, - ["a[1]:x[1]", "a[2]:x[1]", "a[1]:x[2]", "a[2]:x[2]"], - factor_codings_ax, - term_codings_ax_missing_cm) + pytest.raises( + (ValueError, KeyError), + DesignInfo, + ["a[1]:x[1]", "a[2]:x[1]", "a[1]:x[2]", "a[2]:x[2]"], + factor_codings_ax, + term_codings_ax_missing_cm, + ) # subterm num_columns doesn't match the value computed from the individual # factors - term_codings_ax_wrong_subterm_columns = OrderedDict([ - (t_ax, - [SubtermInfo([f_a, f_x], - {f_a: ContrastMatrix(np.ones((2, 3)), - ["[1]", "[2]", "[3]"])}, - # should be 2 * 3 = 6 - 5)])]) - pytest.raises(ValueError, DesignInfo, - ["a[1]:x[1]", "a[2]:x[1]", "a[3]:x[1]", - "a[1]:x[2]", "a[2]:x[2]", "a[3]:x[2]"], - factor_codings_ax, - term_codings_ax_wrong_subterm_columns) + term_codings_ax_wrong_subterm_columns = OrderedDict( + [ + ( + t_ax, + [ + SubtermInfo( + [f_a, f_x], + {f_a: ContrastMatrix(np.ones((2, 3)), ["[1]", "[2]", "[3]"])}, + # should be 2 * 3 = 6 + 5, + ) + ], + ) + ] + ) + pytest.raises( + ValueError, + DesignInfo, + ["a[1]:x[1]", "a[2]:x[1]", "a[3]:x[1]", "a[1]:x[2]", "a[2]:x[2]", "a[3]:x[2]"], + factor_codings_ax, + term_codings_ax_wrong_subterm_columns, + ) + def test_DesignInfo_from_array(): di = DesignInfo.from_array([1, 2, 3]) @@ -886,8 +1004,7 @@ def test_DesignInfo_from_array(): assert di2.column_names == ["column0", "column1"] di3 = DesignInfo.from_array([1, 2, 3], default_column_prefix="x") assert di3.column_names == ["x0"] - di4 = DesignInfo.from_array([[1, 2], [2, 3], [3, 4]], - default_column_prefix="x") + di4 = DesignInfo.from_array([[1, 2], [2, 3], [3, 4]], default_column_prefix="x") assert di4.column_names == ["x0", "x1"] m = DesignMatrix([1, 2, 3], di3) assert DesignInfo.from_array(m) is di3 @@ -897,24 +1014,26 @@ def test_DesignInfo_from_array(): assert di_weird.column_names == ["column0"] import pytest + pytest.raises(ValueError, DesignInfo.from_array, np.ones((2, 2, 2))) from patsy.util import have_pandas + if have_pandas: import pandas + # with named columns - di5 = DesignInfo.from_array(pandas.DataFrame([[1, 2]], - columns=["a", "b"])) + di5 = DesignInfo.from_array(pandas.DataFrame([[1, 2]], columns=["a", "b"])) assert di5.column_names == ["a", "b"] # with irregularly numbered columns - di6 = DesignInfo.from_array(pandas.DataFrame([[1, 2]], - columns=[0, 10])) + di6 = DesignInfo.from_array(pandas.DataFrame([[1, 2]], columns=[0, 10])) assert di6.column_names == ["column0", "column10"] # with .design_info attr df = pandas.DataFrame([[1, 2]]) df.design_info = di6 assert DesignInfo.from_array(df) is di6 + def test_DesignInfo_linear_constraint(): di = DesignInfo(["a1", "a2", "a3", "b"]) con = di.linear_constraint(["2 * a1 = b + 1", "a3"]) @@ -922,17 +1041,21 @@ def test_DesignInfo_linear_constraint(): assert np.all(con.coefs == [[2, 0, 0, -1], [0, 0, 1, 0]]) assert np.all(con.constants == [[1], [0]]) + def test_DesignInfo_deprecated_attributes(): d = DesignInfo(["a1", "a2"]) + def check(attr): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") assert getattr(d, attr) is d assert len(w) == 1 assert w[0].category is DeprecationWarning + check("builder") check("design_info") + # Idea: format with a reasonable amount of precision, then if that turns out # to be higher than necessary, remove as many zeros as we can. 
But only do # this while we can do it to *all* the ordinarily-formatted numbers, to keep @@ -945,8 +1068,12 @@ def _format_float_column(precision, col): col_strs = np.array([format_str % (x,) for x in col], dtype=object) # Really every item should have a decimal, but just in case, we don't want # to strip zeros off the end of "10" or something like that. - mask = np.array([simple_float_chars.issuperset(col_str) and "." in col_str - for col_str in col_strs]) + mask = np.array( + [ + simple_float_chars.issuperset(col_str) and "." in col_str + for col_str in col_strs + ] + ) mask_idxes = np.nonzero(mask)[0] strip_char = "0" if np.any(mask): @@ -961,11 +1088,13 @@ def _format_float_column(precision, col): break return col_strs + def test__format_float_column(): def t(precision, numbers, expected): got = _format_float_column(precision, np.asarray(numbers)) print(got, expected) assert np.array_equal(got, expected) + # This acts weird on old python versions (e.g. it can be "-nan"), so don't # hardcode it: nan_string = "%.3f" % (np.nan,) @@ -974,6 +1103,7 @@ def t(precision, numbers, expected): t(3, [1.0001, 2, 3, np.nan], ["1", "2", "3", nan_string]) t(4, [1.0001, 2, 3, np.nan], ["1.0001", "2.0000", "3.0000", nan_string]) + # http://docs.scipy.org/doc/numpy/user/basics.subclassing.html#slightly-more-realistic-example-attribute-added-to-existing-array class DesignMatrix(np.ndarray): """A simple numpy array subclass that carries design matrix metadata. @@ -997,8 +1127,7 @@ class DesignMatrix(np.ndarray): present only on "real" DesignMatrix objects. """ - def __new__(cls, input_array, design_info=None, - default_column_prefix="column"): + def __new__(cls, input_array, design_info=None, default_column_prefix="column"): """Create a DesignMatrix, or cast an existing matrix to a DesignMatrix. A call like:: @@ -1022,8 +1151,9 @@ def __new__(cls, input_array, design_info=None, # from turning non-design-matrix arrays into DesignMatrix # instances. (E.g., my_dm.diagonal() will return a DesignMatrix # object, but one without a design_info attribute.) 
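A usage note on the caveat above: design_info is attached only to matrices that went through __new__, so derived arrays keep the DesignMatrix type but drop the metadata. A small sketch (the column names here are arbitrary):

    import numpy as np
    from patsy import DesignInfo, DesignMatrix

    dm = DesignMatrix([[1, 2], [3, 4]], DesignInfo(["a", "b"]))
    print(dm.design_info.column_names)  # ['a', 'b']
    # Derived arrays keep the subclass but lose the metadata:
    print(hasattr(dm.diagonal(), "design_info"))  # False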
- if (isinstance(input_array, DesignMatrix) - and hasattr(input_array, "design_info")): + if isinstance(input_array, DesignMatrix) and hasattr( + input_array, "design_info" + ): return input_array self = atleast_2d_column_default(input_array).view(cls) # Upcast integer to floating point @@ -1035,15 +1165,17 @@ def __new__(cls, input_array, design_info=None, if design_info is None: design_info = DesignInfo.from_array(self, default_column_prefix) if len(design_info.column_names) != self.shape[1]: - raise ValueError("wrong number of column names for design matrix " - "(got %s, wanted %s)" - % (len(design_info.column_names), self.shape[1])) + raise ValueError( + "wrong number of column names for design matrix " + "(got %s, wanted %s)" % (len(design_info.column_names), self.shape[1]) + ) self.design_info = design_info if not safe_issubdtype(self.dtype, np.floating): raise ValueError("design matrix must be real-valued floating point") return self __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): if not hasattr(self, "design_info"): # Not a real DesignMatrix @@ -1064,26 +1196,32 @@ def _repr_pretty_(self, p, cycle): names = self.design_info.column_names column_name_widths = [len(name) for name in names] - min_total_width = (INDENT + SEP * (self.shape[1] - 1) - + np.sum(column_name_widths)) + min_total_width = ( + INDENT + SEP * (self.shape[1] - 1) + np.sum(column_name_widths) + ) if min_total_width <= MAX_TOTAL_WIDTH: printable_part = np.asarray(self)[:MAX_ROWS, :] - formatted_cols = [_format_float_column(PRECISION, - printable_part[:, i]) - for i in range(self.shape[1])] + formatted_cols = [ + _format_float_column(PRECISION, printable_part[:, i]) + for i in range(self.shape[1]) + ] + def max_width(col): assert col.ndim == 1 if not col.shape[0]: return 0 else: return max([len(s) for s in col]) + column_num_widths = [max_width(col) for col in formatted_cols] - column_widths = [max(name_width, num_width) - for (name_width, num_width) - in zip(column_name_widths, column_num_widths)] - total_width = (INDENT + SEP * (self.shape[1] - 1) - + np.sum(column_widths)) - print_numbers = (total_width < MAX_TOTAL_WIDTH) + column_widths = [ + max(name_width, num_width) + for (name_width, num_width) in zip( + column_name_widths, column_num_widths + ) + ] + total_width = INDENT + SEP * (self.shape[1] - 1) + np.sum(column_widths) + print_numbers = total_width < MAX_TOTAL_WIDTH else: print_numbers = False @@ -1094,8 +1232,7 @@ def max_width(col): sep = " " * SEP # list() is for Py3 compatibility for row in [names] + list(zip(*formatted_cols)): - cells = [cell.rjust(width) - for (width, cell) in zip(column_widths, row)] + cells = [cell.rjust(width) for (width, cell) in zip(column_widths, row)] p.text(sep.join(cells)) p.text("\n" + " " * p.indentation) if MAX_ROWS < self.shape[0]: @@ -1134,6 +1271,7 @@ def max_width(col): __reduce__ = no_pickling + def test_design_matrix(): import pytest @@ -1145,8 +1283,7 @@ def test_design_matrix(): pytest.raises(ValueError, DesignMatrix, [[12, 14, 16, 18]], bad_di) mm2 = DesignMatrix([[12, 14, 16, 18]]) - assert mm2.design_info.column_names == ["column0", "column1", "column2", - "column3"] + assert mm2.design_info.column_names == ["column0", "column1", "column2", "column3"] mm3 = DesignMatrix([12, 14, 16, 18]) assert mm3.shape == (4, 1) diff --git a/patsy/eval.py b/patsy/eval.py index 6b4f5ea..12ce0d4 100644 --- a/patsy/eval.py +++ b/patsy/eval.py @@ -20,10 +20,10 @@ import numbers from patsy import PatsyError from patsy.util import PushbackAdapter, no_pickling, 
assert_no_pickling -from patsy.tokens import (pretty_untokenize, normalize_token_spacing, - python_tokenize) +from patsy.tokens import pretty_untokenize, normalize_token_spacing, python_tokenize from patsy.compat import call_and_wrap_exc + def _all_future_flags(): flags = 0 for feature_name in __future__.all_feature_names: @@ -36,8 +36,10 @@ def _all_future_flags(): flags |= feature.compiler_flag return flags + _ALL_FUTURE_FLAGS = _all_future_flags() + # This is just a minimal dict-like object that does lookup in a 'stack' of # dicts -- first it checks the first, then the second, etc. Assignments go # into an internal, zeroth dict. @@ -85,6 +87,7 @@ def test_VarLookupDict(): assert "a" in ds assert "c" not in ds import pytest + pytest.raises(KeyError, ds.__getitem__, "c") ds["a"] = 10 assert ds["a"] == 10 @@ -94,6 +97,7 @@ def test_VarLookupDict(): assert_no_pickling(ds) + def ast_names(code): """Iterator that yields all the (ast) names in a Python expression. @@ -106,34 +110,44 @@ def ast_names(code): for node in ast.walk(ast.parse(code)): if isinstance(node, disallowed_ast_nodes): - raise PatsyError("Lambda, list/dict/set comprehension, generator " - "expression in patsy formula not currently supported.") + raise PatsyError( + "Lambda, list/dict/set comprehension, generator " + "expression in patsy formula not currently supported." + ) if isinstance(node, ast.Name): yield node.id + def test_ast_names(): - test_data = [('np.log(x)', ['np', 'x']), - ('x', ['x']), - ('center(x + 1)', ['center', 'x']), - ('dt.date.dt.month', ['dt'])] + test_data = [ + ("np.log(x)", ["np", "x"]), + ("x", ["x"]), + ("center(x + 1)", ["center", "x"]), + ("dt.date.dt.month", ["dt"]), + ] for code, expected in test_data: assert set(ast_names(code)) == set(expected) + def test_ast_names_disallowed_nodes(): import pytest + def list_ast_names(code): return list(ast_names(code)) + pytest.raises(PatsyError, list_ast_names, "lambda x: x + y") pytest.raises(PatsyError, list_ast_names, "[x + 1 for x in range(10)]") pytest.raises(PatsyError, list_ast_names, "(x + 1 for x in range(10))") pytest.raises(PatsyError, list_ast_names, "{x: True for x in range(10)}") pytest.raises(PatsyError, list_ast_names, "{x + 1 for x in range(10)}") + class EvalEnvironment(object): """Represents a Python execution environment. Encapsulates a namespace for variable lookup and set of __future__ flags.""" + def __init__(self, namespaces, flags=0): assert not flags & ~_ALL_FUTURE_FLAGS self._namespaces = list(namespaces) @@ -150,8 +164,7 @@ def with_outer_namespace(self, outer_namespace): This namespace will be used only for variables that are not found in any existing namespace, i.e., it is "outside" them all.""" - return self.__class__(self._namespaces + [outer_namespace], - self.flags) + return self.__class__(self._namespaces + [outer_namespace], self.flags) def eval(self, expr, source_name="", inner_namespace={}): """Evaluate some Python code in the encapsulated environment. @@ -163,8 +176,7 @@ def eval(self, expr, source_name="", inner_namespace={}): :returns: The value of `expr`. 
""" code = compile(expr, source_name, "eval", self.flags, False) - return eval(code, {}, VarLookupDict([inner_namespace] - + self._namespaces)) + return eval(code, {}, VarLookupDict([inner_namespace] + self._namespaces)) @classmethod def capture(cls, eval_env=0, reference=0): @@ -216,16 +228,20 @@ def my_model(formula_like, data, eval_env=0): elif isinstance(eval_env, numbers.Integral): depth = eval_env + reference else: - raise TypeError("Parameter 'eval_env' must be either an integer " - "or an instance of patsy.EvalEnvironment.") + raise TypeError( + "Parameter 'eval_env' must be either an integer " + "or an instance of patsy.EvalEnvironment." + ) frame = inspect.currentframe() try: for i in range(depth + 1): if frame is None: raise ValueError("call-stack is not that deep!") frame = frame.f_back - return cls([frame.f_locals, frame.f_globals], - frame.f_code.co_flags & _ALL_FUTURE_FLAGS) + return cls( + [frame.f_locals, frame.f_globals], + frame.f_code.co_flags & _ALL_FUTURE_FLAGS, + ) # The try/finally is important to avoid a potential reference cycle -- # any exception traceback will carry a reference to *our* frame, which # contains a reference to our local variables, which would otherwise @@ -245,37 +261,42 @@ def _namespace_ids(self): return [id(n) for n in self._namespaces] def __eq__(self, other): - return (isinstance(other, EvalEnvironment) - and self.flags == other.flags - and self._namespace_ids() == other._namespace_ids()) + return ( + isinstance(other, EvalEnvironment) + and self.flags == other.flags + and self._namespace_ids() == other._namespace_ids() + ) def __ne__(self, other): return not self == other def __hash__(self): - return hash((EvalEnvironment, - self.flags, - tuple(self._namespace_ids()))) + return hash((EvalEnvironment, self.flags, tuple(self._namespace_ids()))) __getstate__ = no_pickling -def _a(): # pragma: no cover + +def _a(): # pragma: no cover _a = 1 return _b() -def _b(): # pragma: no cover + +def _b(): # pragma: no cover _b = 1 return _c() -def _c(): # pragma: no cover + +def _c(): # pragma: no cover _c = 1 - return [EvalEnvironment.capture(), - EvalEnvironment.capture(0), - EvalEnvironment.capture(1), - EvalEnvironment.capture(0, reference=1), - EvalEnvironment.capture(2), - EvalEnvironment.capture(0, 2), - ] + return [ + EvalEnvironment.capture(), + EvalEnvironment.capture(0), + EvalEnvironment.capture(1), + EvalEnvironment.capture(0, reference=1), + EvalEnvironment.capture(2), + EvalEnvironment.capture(0, 2), + ] + def test_EvalEnvironment_capture_namespace(): c0, c, b1, b2, a1, a2 = _a() @@ -294,7 +315,8 @@ def test_EvalEnvironment_capture_namespace(): assert b1.namespace["_c"] is _c assert b2.namespace["_c"] is _c import pytest - pytest.raises(ValueError, EvalEnvironment.capture, 10 ** 6) + + pytest.raises(ValueError, EvalEnvironment.capture, 10**6) assert EvalEnvironment.capture(b1) is b1 @@ -302,24 +324,28 @@ def test_EvalEnvironment_capture_namespace(): assert_no_pickling(EvalEnvironment.capture()) + def test_EvalEnvironment_capture_flags(): # This is the only __future__ feature currently usable in Python # 3... fortunately it is probably not going anywhere. 
TEST_FEATURE = "barry_as_FLUFL" test_flag = getattr(__future__, TEST_FEATURE).compiler_flag assert test_flag & _ALL_FUTURE_FLAGS - source = ("def f():\n" - " in_f = 'hi from f'\n" - " global RETURN_INNER, RETURN_OUTER, RETURN_INNER_FROM_OUTER\n" - " RETURN_INNER = EvalEnvironment.capture(0)\n" - " RETURN_OUTER = call_capture_0()\n" - " RETURN_INNER_FROM_OUTER = call_capture_1()\n" - "f()\n") + source = ( + "def f():\n" + " in_f = 'hi from f'\n" + " global RETURN_INNER, RETURN_OUTER, RETURN_INNER_FROM_OUTER\n" + " RETURN_INNER = EvalEnvironment.capture(0)\n" + " RETURN_OUTER = call_capture_0()\n" + " RETURN_INNER_FROM_OUTER = call_capture_1()\n" + "f()\n" + ) code = compile(source, "", "exec", 0, 1) - env = {"EvalEnvironment": EvalEnvironment, - "call_capture_0": lambda: EvalEnvironment.capture(0), - "call_capture_1": lambda: EvalEnvironment.capture(1), - } + env = { + "EvalEnvironment": EvalEnvironment, + "call_capture_0": lambda: EvalEnvironment.capture(0), + "call_capture_1": lambda: EvalEnvironment.capture(1), + } env2 = dict(env) exec(code, env) assert env["RETURN_INNER"].namespace["in_f"] == "hi from f" @@ -329,9 +355,13 @@ def test_EvalEnvironment_capture_flags(): assert env["RETURN_OUTER"].flags & _ALL_FUTURE_FLAGS == 0 assert env["RETURN_INNER_FROM_OUTER"].flags & _ALL_FUTURE_FLAGS == 0 - code2 = compile(("from __future__ import %s\n" % (TEST_FEATURE,)) - + source, - "", "exec", 0, 1) + code2 = compile( + ("from __future__ import %s\n" % (TEST_FEATURE,)) + source, + "", + "exec", + 0, + 1, + ) exec(code2, env2) assert env2["RETURN_INNER"].namespace["in_f"] == "hi from f" assert env2["RETURN_INNER_FROM_OUTER"].namespace["in_f"] == "hi from f" @@ -340,11 +370,13 @@ def test_EvalEnvironment_capture_flags(): assert env2["RETURN_OUTER"].flags & _ALL_FUTURE_FLAGS == 0 assert env2["RETURN_INNER_FROM_OUTER"].flags & _ALL_FUTURE_FLAGS == test_flag + def test_EvalEnvironment_eval_namespace(): env = EvalEnvironment([{"a": 1}]) assert env.eval("2 * a") == 2 assert env.eval("2 * a", inner_namespace={"a": 2}) == 4 import pytest + pytest.raises(NameError, env.eval, "2 * b") a = 3 env2 = EvalEnvironment.capture(0) @@ -354,6 +386,7 @@ def test_EvalEnvironment_eval_namespace(): assert env3.eval("2 * a") == 2 assert env3.eval("2 * b") == 6 + def test_EvalEnvironment_eval_flags(): import pytest @@ -374,12 +407,14 @@ def test_EvalEnvironment_eval_flags(): assert env2.subset(["a"]).flags == test_flag assert env2.with_outer_namespace({"b": 10}).flags == test_flag + def test_EvalEnvironment_subset(): env = EvalEnvironment([{"a": 1}, {"b": 2}, {"c": 3}]) subset_a = env.subset(["a"]) assert subset_a.eval("a") == 1 import pytest + pytest.raises(NameError, subset_a.eval, "b") pytest.raises(NameError, subset_a.eval, "c") @@ -387,6 +422,7 @@ def test_EvalEnvironment_subset(): assert subset_bc.eval("b * c") == 6 pytest.raises(NameError, subset_bc.eval, "a") + def test_EvalEnvironment_eq(): # Two environments are eq only if they refer to exactly the same # global/local dicts @@ -399,6 +435,7 @@ def test_EvalEnvironment_eq(): env4 = capture_local_env() assert env3 != env4 + _builtins_dict = {} exec("from patsy.builtins import *", {}, _builtins_dict) # This is purely to make the existence of patsy.builtins visible to systems @@ -406,6 +443,7 @@ def test_EvalEnvironment_eq(): # that patsy.builtins will be present in sys.modules in any case. 
import patsy.builtins + class EvalFactor(object): def __init__(self, code, origin=None): """A factor class that executes arbitrary Python code and supports @@ -440,8 +478,7 @@ def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.code) def __eq__(self, other): - return (isinstance(other, EvalFactor) - and self.code == other.code) + return isinstance(other, EvalFactor) and self.code == other.code def __ne__(self, other): return not self == other @@ -456,13 +493,13 @@ def memorize_passes_needed(self, state, eval_env): eval_env = eval_env.with_outer_namespace(_builtins_dict) env_namespace = eval_env.namespace - subset_names = [name for name in ast_names(self.code) - if name in env_namespace] + subset_names = [name for name in ast_names(self.code) if name in env_namespace] eval_env = eval_env.subset(subset_names) state["eval_env"] = eval_env # example code: == "2 * center(x)" i = [0] + def new_name_maker(token): value = eval_env.namespace.get(token) if hasattr(value, "__patsy_stateful_transform__"): @@ -473,14 +510,17 @@ def new_name_maker(token): return obj_name + ".transform" else: return token + # example eval_code: == "2 * _patsy_stobj0__center__.transform(x)" eval_code = replace_bare_funcalls(self.code, new_name_maker) state["eval_code"] = eval_code # paranoia: verify that none of our new names appeared anywhere in the # original code if has_bare_variable_reference(state["transforms"], self.code): - raise PatsyError("names of this form are reserved for " - "internal use (%s)" % (token,), token.origin) + raise PatsyError( + "names of this form are reserved for " "internal use (%s)" % (token,), + token.origin, + ) # Pull out all the '_patsy_stobj0__center__.transform(x)' pieces # to make '_patsy_stobj0__center__.memorize_chunk(x)' pieces state["memorize_code"] = {} @@ -491,9 +531,11 @@ def new_name_maker(token): transform_call_name, transform_call_code = transform_call assert transform_call_name == obj_name + ".transform" assert transform_call_code.startswith(transform_call_name + "(") - memorize_code = (obj_name - + ".memorize_chunk" - + transform_call_code[len(transform_call_name):]) + memorize_code = ( + obj_name + + ".memorize_chunk" + + transform_call_code[len(transform_call_name) :] + ) state["memorize_code"][obj_name] = memorize_code # Then sort the codes into bins, so that every item in bin number i # depends only on items in bin (i-1) or less. 
(By 'depends', we mean @@ -529,29 +571,28 @@ def new_name_maker(token): def _eval(self, code, memorize_state, data): inner_namespace = VarLookupDict([data, memorize_state["transforms"]]) - return call_and_wrap_exc("Error evaluating factor", - self, - memorize_state["eval_env"].eval, - code, - inner_namespace=inner_namespace) + return call_and_wrap_exc( + "Error evaluating factor", + self, + memorize_state["eval_env"].eval, + code, + inner_namespace=inner_namespace, + ) def memorize_chunk(self, state, which_pass, data): for obj_name in state["pass_bins"][which_pass]: - self._eval(state["memorize_code"][obj_name], - state, - data) + self._eval(state["memorize_code"][obj_name], state, data) def memorize_finish(self, state, which_pass): for obj_name in state["pass_bins"][which_pass]: state["transforms"][obj_name].memorize_finish() def eval(self, memorize_state, data): - return self._eval(memorize_state["eval_code"], - memorize_state, - data) + return self._eval(memorize_state["eval_code"], memorize_state, data) __getstate__ = no_pickling + def test_EvalFactor_basics(): e = EvalFactor("a+b") assert e.code == "a + b" @@ -564,8 +605,10 @@ def test_EvalFactor_basics(): assert_no_pickling(e) + def test_EvalFactor_memorize_passes_needed(): from patsy.state import stateful_transform + foo = stateful_transform(lambda: "FOO-OBJ") bar = stateful_transform(lambda: "BAR-OBJ") quux = stateful_transform(lambda: "QUUX-OBJ") @@ -581,30 +624,30 @@ def test_EvalFactor_memorize_passes_needed(): assert state["eval_env"].namespace[name] is locals()[name] for name in ["w", "x", "y", "z", "e", "state"]: assert name not in state["eval_env"].namespace - assert state["transforms"] == {"_patsy_stobj0__foo__": "FOO-OBJ", - "_patsy_stobj1__bar__": "BAR-OBJ", - "_patsy_stobj2__foo__": "FOO-OBJ", - "_patsy_stobj3__quux__": "QUUX-OBJ"} - assert (state["eval_code"] - == "_patsy_stobj0__foo__.transform(x)" - " + _patsy_stobj1__bar__.transform(" - "_patsy_stobj2__foo__.transform(y))" - " + _patsy_stobj3__quux__.transform(z, w)") - - assert (state["memorize_code"] - == {"_patsy_stobj0__foo__": - "_patsy_stobj0__foo__.memorize_chunk(x)", - "_patsy_stobj1__bar__": - "_patsy_stobj1__bar__.memorize_chunk(_patsy_stobj2__foo__.transform(y))", - "_patsy_stobj2__foo__": - "_patsy_stobj2__foo__.memorize_chunk(y)", - "_patsy_stobj3__quux__": - "_patsy_stobj3__quux__.memorize_chunk(z, w)", - }) - assert state["pass_bins"] == [set(["_patsy_stobj0__foo__", - "_patsy_stobj2__foo__", - "_patsy_stobj3__quux__"]), - set(["_patsy_stobj1__bar__"])] + assert state["transforms"] == { + "_patsy_stobj0__foo__": "FOO-OBJ", + "_patsy_stobj1__bar__": "BAR-OBJ", + "_patsy_stobj2__foo__": "FOO-OBJ", + "_patsy_stobj3__quux__": "QUUX-OBJ", + } + assert ( + state["eval_code"] == "_patsy_stobj0__foo__.transform(x)" + " + _patsy_stobj1__bar__.transform(" + "_patsy_stobj2__foo__.transform(y))" + " + _patsy_stobj3__quux__.transform(z, w)" + ) + + assert state["memorize_code"] == { + "_patsy_stobj0__foo__": "_patsy_stobj0__foo__.memorize_chunk(x)", + "_patsy_stobj1__bar__": "_patsy_stobj1__bar__.memorize_chunk(_patsy_stobj2__foo__.transform(y))", + "_patsy_stobj2__foo__": "_patsy_stobj2__foo__.memorize_chunk(y)", + "_patsy_stobj3__quux__": "_patsy_stobj3__quux__.memorize_chunk(z, w)", + } + assert state["pass_bins"] == [ + set(["_patsy_stobj0__foo__", "_patsy_stobj2__foo__", "_patsy_stobj3__quux__"]), + set(["_patsy_stobj1__bar__"]), + ] + class _MockTransform(object): # Adds up all memorized data, then subtracts that sum from each datum @@ -616,6 +659,7 @@ def 
__init__(self): def memorize_chunk(self, data): self._memorize_chunk_called += 1 import numpy as np + self._sum += np.sum(data) def memorize_finish(self): @@ -624,8 +668,10 @@ def memorize_finish(self): def transform(self, data): return data - self._sum + def test_EvalFactor_end_to_end(): from patsy.state import stateful_transform + foo = stateful_transform(_MockTransform) e = EvalFactor("foo(x) + foo(foo(y))") state = {} @@ -638,13 +684,11 @@ def test_EvalFactor_end_to_end(): for name in ["x", "y", "e", "state"]: assert name not in state["eval_env"].namespace import numpy as np - e.memorize_chunk(state, 0, - {"x": np.array([1, 2]), - "y": np.array([10, 11])}) + + e.memorize_chunk(state, 0, {"x": np.array([1, 2]), "y": np.array([10, 11])}) assert state["transforms"]["_patsy_stobj0__foo__"]._memorize_chunk_called == 1 assert state["transforms"]["_patsy_stobj2__foo__"]._memorize_chunk_called == 1 - e.memorize_chunk(state, 0, {"x": np.array([12, -10]), - "y": np.array([100, 3])}) + e.memorize_chunk(state, 0, {"x": np.array([12, -10]), "y": np.array([100, 3])}) assert state["transforms"]["_patsy_stobj0__foo__"]._memorize_chunk_called == 2 assert state["transforms"]["_patsy_stobj2__foo__"]._memorize_chunk_called == 2 assert state["transforms"]["_patsy_stobj0__foo__"]._memorize_finish_called == 0 @@ -654,10 +698,8 @@ def test_EvalFactor_end_to_end(): assert state["transforms"]["_patsy_stobj2__foo__"]._memorize_finish_called == 1 assert state["transforms"]["_patsy_stobj1__foo__"]._memorize_chunk_called == 0 assert state["transforms"]["_patsy_stobj1__foo__"]._memorize_finish_called == 0 - e.memorize_chunk(state, 1, {"x": np.array([1, 2]), - "y": np.array([10, 11])}) - e.memorize_chunk(state, 1, {"x": np.array([12, -10]), - "y": np.array([100, 3])}) + e.memorize_chunk(state, 1, {"x": np.array([1, 2]), "y": np.array([10, 11])}) + e.memorize_chunk(state, 1, {"x": np.array([12, -10]), "y": np.array([100, 3])}) e.memorize_finish(state, 1) for transform in state["transforms"].values(): assert transform._memorize_chunk_called == 2 @@ -671,70 +713,78 @@ def test_EvalFactor_end_to_end(): # 2: -114, -113, -24, -121 # 1: 258, 259, 348, 251 # 0 + 1: 254, 256, 355, 236 - assert np.all(e.eval(state, - {"x": np.array([1, 2, 12, -10]), - "y": np.array([10, 11, 100, 3])}) - == [254, 256, 355, 236]) + assert np.all( + e.eval(state, {"x": np.array([1, 2, 12, -10]), "y": np.array([10, 11, 100, 3])}) + == [254, 256, 355, 236] + ) + def annotated_tokens(code): prev_was_dot = False it = PushbackAdapter(python_tokenize(code)) - for (token_type, token, origin) in it: + for token_type, token, origin in it: props = {} - props["bare_ref"] = (not prev_was_dot and token_type == tokenize.NAME) - props["bare_funcall"] = (props["bare_ref"] - and it.has_more() and it.peek()[1] == "(") + props["bare_ref"] = not prev_was_dot and token_type == tokenize.NAME + props["bare_funcall"] = ( + props["bare_ref"] and it.has_more() and it.peek()[1] == "(" + ) yield (token_type, token, origin, props) - prev_was_dot = (token == ".") + prev_was_dot = token == "." 
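The rewriting machinery exercised above is what makes stateful transforms work from the public API; a quick sketch using patsy's built-in center() with made-up data:

    import numpy as np
    from patsy import dmatrix

    data = {"x": np.array([1.0, 2.0, 3.0, 4.0])}
    # center() is stateful: its mean is memorized in a first pass
    # (memorize_chunk/memorize_finish) and then applied in eval().
    mm = dmatrix("center(x)", data)
    print(np.asarray(mm))  # intercept column plus x - mean(x)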
+ def test_annotated_tokens(): - tokens_without_origins = [(token_type, token, props) - for (token_type, token, origin, props) - in (annotated_tokens("a(b) + c.d"))] - assert (tokens_without_origins - == [(tokenize.NAME, "a", {"bare_ref": True, "bare_funcall": True}), - (tokenize.OP, "(", {"bare_ref": False, "bare_funcall": False}), - (tokenize.NAME, "b", {"bare_ref": True, "bare_funcall": False}), - (tokenize.OP, ")", {"bare_ref": False, "bare_funcall": False}), - (tokenize.OP, "+", {"bare_ref": False, "bare_funcall": False}), - (tokenize.NAME, "c", {"bare_ref": True, "bare_funcall": False}), - (tokenize.OP, ".", {"bare_ref": False, "bare_funcall": False}), - (tokenize.NAME, "d", - {"bare_ref": False, "bare_funcall": False}), - ]) + tokens_without_origins = [ + (token_type, token, props) + for (token_type, token, origin, props) in (annotated_tokens("a(b) + c.d")) + ] + assert tokens_without_origins == [ + (tokenize.NAME, "a", {"bare_ref": True, "bare_funcall": True}), + (tokenize.OP, "(", {"bare_ref": False, "bare_funcall": False}), + (tokenize.NAME, "b", {"bare_ref": True, "bare_funcall": False}), + (tokenize.OP, ")", {"bare_ref": False, "bare_funcall": False}), + (tokenize.OP, "+", {"bare_ref": False, "bare_funcall": False}), + (tokenize.NAME, "c", {"bare_ref": True, "bare_funcall": False}), + (tokenize.OP, ".", {"bare_ref": False, "bare_funcall": False}), + (tokenize.NAME, "d", {"bare_ref": False, "bare_funcall": False}), + ] # This was a bug: assert len(list(annotated_tokens("x"))) == 1 + def has_bare_variable_reference(names, code): - for (_, token, _, props) in annotated_tokens(code): + for _, token, _, props in annotated_tokens(code): if props["bare_ref"] and token in names: return True return False + def replace_bare_funcalls(code, replacer): tokens = [] - for (token_type, token, origin, props) in annotated_tokens(code): + for token_type, token, origin, props in annotated_tokens(code): if props["bare_ref"] and props["bare_funcall"]: token = replacer(token) tokens.append((token_type, token)) return pretty_untokenize(tokens) + def test_replace_bare_funcalls(): def replacer1(token): return {"a": "b", "foo": "_internal.foo.process"}.get(token, token) + def t1(code, expected): replaced = replace_bare_funcalls(code, replacer1) print("%r -> %r" % (code, replaced)) print("(wanted %r)" % (expected,)) assert replaced == expected + t1("foobar()", "foobar()") t1("a()", "b()") t1("foobar.a()", "foobar.a()") t1("foo()", "_internal.foo.process()") t1("a + 1", "a + 1") - t1("b() + a() * x[foo(2 ** 3)]", - "b() + b() * x[_internal.foo.process(2 ** 3)]") + t1("b() + a() * x[foo(2 ** 3)]", "b() + b() * x[_internal.foo.process(2 ** 3)]") + class _FuncallCapturer(object): # captures the next funcall @@ -763,25 +813,33 @@ def add_token(self, token_type, token): if self.started and self.paren_depth == 0: self.done = True + # This is not a very general function -- it assumes that all references to the # given object are of the form '.something(method call)'. 
def capture_obj_method_calls(obj_name, code): capturers = [] - for (token_type, token, origin, props) in annotated_tokens(code): + for token_type, token, origin, props in annotated_tokens(code): for capturer in capturers: capturer.add_token(token_type, token) if props["bare_ref"] and token == obj_name: capturers.append(_FuncallCapturer(token_type, token)) - return [("".join(capturer.func), pretty_untokenize(capturer.tokens)) - for capturer in capturers] + return [ + ("".join(capturer.func), pretty_untokenize(capturer.tokens)) + for capturer in capturers + ] + def test_capture_obj_method_calls(): - assert (capture_obj_method_calls("foo", "a + foo.baz(bar) + b.c(d)") - == [("foo.baz", "foo.baz(bar)")]) - assert (capture_obj_method_calls("b", "a + foo.baz(bar) + b.c(d)") - == [("b.c", "b.c(d)")]) - assert (capture_obj_method_calls("foo", "foo.bar(foo.baz(quux))") - == [("foo.bar", "foo.bar(foo.baz(quux))"), - ("foo.baz", "foo.baz(quux)")]) - assert (capture_obj_method_calls("bar", "foo[bar.baz(x(z[asdf])) ** 2]") - == [("bar.baz", "bar.baz(x(z[asdf]))")]) + assert capture_obj_method_calls("foo", "a + foo.baz(bar) + b.c(d)") == [ + ("foo.baz", "foo.baz(bar)") + ] + assert capture_obj_method_calls("b", "a + foo.baz(bar) + b.c(d)") == [ + ("b.c", "b.c(d)") + ] + assert capture_obj_method_calls("foo", "foo.bar(foo.baz(quux))") == [ + ("foo.bar", "foo.bar(foo.baz(quux))"), + ("foo.baz", "foo.baz(quux)"), + ] + assert capture_obj_method_calls("bar", "foo[bar.baz(x(z[asdf])) ** 2]") == [ + ("bar.baz", "bar.baz(x(z[asdf]))") + ] diff --git a/patsy/highlevel.py b/patsy/highlevel.py index 2138367..43a000e 100644 --- a/patsy/highlevel.py +++ b/patsy/highlevel.py @@ -3,8 +3,7 @@ # See file LICENSE.txt for license information. # These are made available in the patsy.* namespace: -__all__ = ["dmatrix", "dmatrices", - "incr_dbuilder", "incr_dbuilders"] +__all__ = ["dmatrix", "dmatrices", "incr_dbuilder", "incr_dbuilders"] # problems: # statsmodels reluctant to pass around separate eval environment, suggesting @@ -19,46 +18,51 @@ from patsy.design_info import DesignMatrix, DesignInfo from patsy.eval import EvalEnvironment from patsy.desc import ModelDesc -from patsy.build import (design_matrix_builders, - build_design_matrices) -from patsy.util import (have_pandas, asarray_or_pandas, - atleast_2d_column_default) +from patsy.build import design_matrix_builders, build_design_matrices +from patsy.util import have_pandas, asarray_or_pandas, atleast_2d_column_default if have_pandas: import pandas + # Tries to build a (lhs, rhs) design given a formula_like and an incremental # data source. If formula_like is not capable of doing this, then returns # None. 
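For the incremental path described here, a sketch of the intended usage with two made-up data chunks; center(x) forces a real memorization pass over the data:

    import numpy as np
    from patsy import build_design_matrices, incr_dbuilder

    chunks = [{"x": np.array([1.0, 2.0])}, {"x": np.array([3.0, 4.0])}]

    def iter_maker():
        return iter(chunks)

    # First pass over the data memorizes state (the mean needed by center):
    design_info = incr_dbuilder("center(x)", iter_maker)
    # Then matrices can be built chunk by chunk with consistent coding:
    mats = [build_design_matrices([design_info], chunk)[0] for chunk in chunks]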
-def _try_incr_builders(formula_like, data_iter_maker, eval_env, - NA_action): +def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action): if isinstance(formula_like, DesignInfo): - return (design_matrix_builders([[]], data_iter_maker, eval_env, NA_action)[0], - formula_like) - if (isinstance(formula_like, tuple) + return ( + design_matrix_builders([[]], data_iter_maker, eval_env, NA_action)[0], + formula_like, + ) + if ( + isinstance(formula_like, tuple) and len(formula_like) == 2 and isinstance(formula_like[0], DesignInfo) - and isinstance(formula_like[1], DesignInfo)): + and isinstance(formula_like[1], DesignInfo) + ): return formula_like if hasattr(formula_like, "__patsy_get_model_desc__"): formula_like = formula_like.__patsy_get_model_desc__(eval_env) if not isinstance(formula_like, ModelDesc): - raise PatsyError("bad value from %r.__patsy_get_model_desc__" - % (formula_like,)) + raise PatsyError( + "bad value from %r.__patsy_get_model_desc__" % (formula_like,) + ) # fallthrough if isinstance(formula_like, str): formula_like = ModelDesc.from_formula(formula_like) # fallthrough if isinstance(formula_like, ModelDesc): assert isinstance(eval_env, EvalEnvironment) - return design_matrix_builders([formula_like.lhs_termlist, - formula_like.rhs_termlist], - data_iter_maker, - eval_env, - NA_action) + return design_matrix_builders( + [formula_like.lhs_termlist, formula_like.rhs_termlist], + data_iter_maker, + eval_env, + NA_action, + ) else: return None + def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"): """Construct a design matrix builder incrementally from a large data set. @@ -96,17 +100,19 @@ def iter_maker(): The ``NA_action`` argument. """ eval_env = EvalEnvironment.capture(eval_env, reference=1) - design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env, - NA_action) + design_infos = _try_incr_builders( + formula_like, data_iter_maker, eval_env, NA_action + ) if design_infos is None: raise PatsyError("bad formula-like object") if len(design_infos[0].column_names) > 0: - raise PatsyError("encountered outcome variables for a model " - "that does not expect them") + raise PatsyError( + "encountered outcome variables for a model " "that does not expect them" + ) return design_infos[1] -def incr_dbuilders(formula_like, data_iter_maker, eval_env=0, - NA_action="drop"): + +def incr_dbuilders(formula_like, data_iter_maker, eval_env=0, NA_action="drop"): """Construct two design matrix builders incrementally from a large data set. @@ -114,14 +120,16 @@ def incr_dbuilders(formula_like, data_iter_maker, eval_env=0, to :func:`dmatrix`. See :func:`incr_dbuilder` for details. 
""" eval_env = EvalEnvironment.capture(eval_env, reference=1) - design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env, - NA_action) + design_infos = _try_incr_builders( + formula_like, data_iter_maker, eval_env, NA_action + ) if design_infos is None: raise PatsyError("bad formula-like object") if len(design_infos[0].column_names) == 0: raise PatsyError("model is missing required outcome variables") return design_infos + # This always returns a length-two tuple, # response, predictors # where @@ -139,34 +147,41 @@ def incr_dbuilders(formula_like, data_iter_maker, eval_env=0, # DesignInfo # (DesignInfo, DesignInfo) # any object with a special method __patsy_get_model_desc__ -def _do_highlevel_design(formula_like, data, eval_env, - NA_action, return_type): +def _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type): if return_type == "dataframe" and not have_pandas: - raise PatsyError("pandas.DataFrame was requested, but pandas " - "is not installed") + raise PatsyError( + "pandas.DataFrame was requested, but pandas " "is not installed" + ) if return_type not in ("matrix", "dataframe"): - raise PatsyError("unrecognized output type %r, should be " - "'matrix' or 'dataframe'" % (return_type,)) + raise PatsyError( + "unrecognized output type %r, should be " + "'matrix' or 'dataframe'" % (return_type,) + ) + def data_iter_maker(): return iter([data]) - design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env, - NA_action) + + design_infos = _try_incr_builders( + formula_like, data_iter_maker, eval_env, NA_action + ) if design_infos is not None: - return build_design_matrices(design_infos, data, - NA_action=NA_action, - return_type=return_type) + return build_design_matrices( + design_infos, data, NA_action=NA_action, return_type=return_type + ) else: # No builders, but maybe we can still get matrices if isinstance(formula_like, tuple): if len(formula_like) != 2: - raise PatsyError("don't know what to do with a length %s " - "matrices tuple" - % (len(formula_like),)) + raise PatsyError( + "don't know what to do with a length %s " + "matrices tuple" % (len(formula_like),) + ) (lhs, rhs) = formula_like else: # subok=True is necessary here to allow DesignMatrixes to pass # through (lhs, rhs) = (None, asarray_or_pandas(formula_like, subok=True)) + # some sort of explicit matrix or matrices were given. 
Currently we # have them in one of these forms: # -- an ndarray or subclass @@ -188,6 +203,7 @@ def _regularize_matrix(m, default_column_prefix): return (m, orig_index) else: return (DesignMatrix(m, di), orig_index) + rhs, rhs_orig_index = _regularize_matrix(rhs, "x") if lhs is None: lhs = np.zeros((rhs.shape[0], 0), dtype=float) @@ -196,13 +212,15 @@ def _regularize_matrix(m, default_column_prefix): assert isinstance(getattr(lhs, "design_info", None), DesignInfo) assert isinstance(getattr(rhs, "design_info", None), DesignInfo) if lhs.shape[0] != rhs.shape[0]: - raise PatsyError("shape mismatch: outcome matrix has %s rows, " - "predictor matrix has %s rows" - % (lhs.shape[0], rhs.shape[0])) + raise PatsyError( + "shape mismatch: outcome matrix has %s rows, " + "predictor matrix has %s rows" % (lhs.shape[0], rhs.shape[0]) + ) if rhs_orig_index is not None and lhs_orig_index is not None: if not rhs_orig_index.equals(lhs_orig_index): - raise PatsyError("index mismatch: outcome and " - "predictor have incompatible indexes") + raise PatsyError( + "index mismatch: outcome and " "predictor have incompatible indexes" + ) if return_type == "dataframe": if rhs_orig_index is not None and lhs_orig_index is None: lhs.index = rhs.index @@ -210,8 +228,8 @@ def _regularize_matrix(m, default_column_prefix): rhs.index = lhs.index return (lhs, rhs) -def dmatrix(formula_like, data={}, eval_env=0, - NA_action="drop", return_type="matrix"): + +def dmatrix(formula_like, data={}, eval_env=0, NA_action="drop", return_type="matrix"): """Construct a single design matrix given a formula_like and data. :arg formula_like: An object that can be used to construct a design @@ -275,15 +293,19 @@ def dmatrix(formula_like, data={}, eval_env=0, The ``NA_action`` argument. """ eval_env = EvalEnvironment.capture(eval_env, reference=1) - (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env, - NA_action, return_type) + (lhs, rhs) = _do_highlevel_design( + formula_like, data, eval_env, NA_action, return_type + ) if lhs.shape[1] != 0: - raise PatsyError("encountered outcome variables for a model " - "that does not expect them") + raise PatsyError( + "encountered outcome variables for a model " "that does not expect them" + ) return rhs -def dmatrices(formula_like, data={}, eval_env=0, - NA_action="drop", return_type="matrix"): + +def dmatrices( + formula_like, data={}, eval_env=0, NA_action="drop", return_type="matrix" +): """Construct two design matrices given a formula_like and data. This function is identical to :func:`dmatrix`, except that it requires @@ -294,8 +316,9 @@ def dmatrices(formula_like, data={}, eval_env=0, See :func:`dmatrix` for details. 
""" eval_env = EvalEnvironment.capture(eval_env, reference=1) - (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env, - NA_action, return_type) + (lhs, rhs) = _do_highlevel_design( + formula_like, data, eval_env, NA_action, return_type + ) if lhs.shape[1] == 0: raise PatsyError("model is missing required outcome variables") return (lhs, rhs) diff --git a/patsy/infix_parser.py b/patsy/infix_parser.py index fb0ddff..6c127b5 100644 --- a/patsy/infix_parser.py +++ b/patsy/infix_parser.py @@ -32,8 +32,13 @@ from patsy import PatsyError from patsy.origin import Origin -from patsy.util import (repr_pretty_delegate, repr_pretty_impl, - no_pickling, assert_no_pickling) +from patsy.util import ( + repr_pretty_delegate, + repr_pretty_impl, + no_pickling, + assert_no_pickling, +) + class _UniqueValue: def __init__(self, print_as): @@ -44,6 +49,7 @@ def __repr__(self): __getstate__ = no_pickling + class Token: """A token with possible payload. @@ -52,6 +58,7 @@ class Token: An arbitrary object indicating the type of this token. Should be :term:`hashable`, but otherwise it can be whatever you like. """ + LPAREN = _UniqueValue("LPAREN") RPAREN = _UniqueValue("RPAREN") @@ -61,6 +68,7 @@ def __init__(self, type, origin, extra=None): self.extra = extra __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): assert not cycle kwargs = [] @@ -70,6 +78,7 @@ def _repr_pretty_(self, p, cycle): __getstate__ = no_pickling + class ParseNode(object): def __init__(self, type, token, args, origin): self.type = type @@ -78,11 +87,13 @@ def __init__(self, type, token, args, origin): self.origin = origin __repr__ = repr_pretty_delegate + def _repr_pretty_(self, p, cycle): return repr_pretty_impl(p, self, [self.type, self.token, self.args]) __getstate__ = no_pickling + class Operator(object): def __init__(self, token_type, arity, precedence): self.token_type = token_type @@ -90,11 +101,16 @@ def __init__(self, token_type, arity, precedence): self.precedence = precedence def __repr__(self): - return "%s(%r, %r, %r)" % (self.__class__.__name__, - self.token_type, self.arity, self.precedence) + return "%s(%r, %r, %r)" % ( + self.__class__.__name__, + self.token_type, + self.arity, + self.precedence, + ) __getstate__ = no_pickling + class _StackOperator(object): def __init__(self, op, token): self.op = op @@ -102,8 +118,10 @@ def __init__(self, op, token): __getstate__ = no_pickling + _open_paren = Operator(Token.LPAREN, -1, -9999999) + class _ParseContext(object): def __init__(self, unary_ops, binary_ops, atomic_types, trace): self.op_stack = [] @@ -115,6 +133,7 @@ def __init__(self, unary_ops, binary_ops, atomic_types, trace): __getstate__ = no_pickling + def _read_noun_context(token, c): if token.type == Token.LPAREN: if c.trace: @@ -129,13 +148,13 @@ def _read_noun_context(token, c): elif token.type in c.atomic_types: if c.trace: print("Pushing noun %r (%r)" % (token.type, token.extra)) - c.noun_stack.append(ParseNode(token.type, token, [], - token.origin)) + c.noun_stack.append(ParseNode(token.type, token, [], token.origin)) return False else: - raise PatsyError("expected a noun, not '%s'" - % (token.origin.relevant_code(),), - token) + raise PatsyError( + "expected a noun, not '%s'" % (token.origin.relevant_code(),), token + ) + def _run_op(c): assert c.op_stack @@ -146,10 +165,15 @@ def _run_op(c): args.reverse() if c.trace: print("Reducing %r (%r)" % (stackop.op.token_type, args)) - node = ParseNode(stackop.op.token_type, stackop.token, args, - Origin.combine([stackop.token] + args)) + node = 
ParseNode( + stackop.op.token_type, + stackop.token, + args, + Origin.combine([stackop.token] + args), + ) c.noun_stack.append(node) + def _read_op_context(token, c): if token.type == Token.RPAREN: if c.trace: @@ -161,9 +185,7 @@ def _read_op_context(token, c): assert c.op_stack[-1].op.token_type == Token.LPAREN # Expand the origin of the item on top of the noun stack to include # the open and close parens: - combined = Origin.combine([c.op_stack[-1].token, - c.noun_stack[-1].token, - token]) + combined = Origin.combine([c.op_stack[-1].token, c.noun_stack[-1].token, token]) c.noun_stack[-1].origin = combined # Pop the open-paren c.op_stack.pop() @@ -172,17 +194,17 @@ def _read_op_context(token, c): if c.trace: print("Found binary operator %r" % (token.type)) stackop = _StackOperator(c.binary_ops[token.type], token) - while (c.op_stack - and stackop.op.precedence <= c.op_stack[-1].op.precedence): + while c.op_stack and stackop.op.precedence <= c.op_stack[-1].op.precedence: _run_op(c) if c.trace: print("Pushing binary operator %r" % (token.type)) c.op_stack.append(stackop) return True else: - raise PatsyError("expected an operator, not '%s'" - % (token.origin.relevant_code(),), - token) + raise PatsyError( + "expected an operator, not '%s'" % (token.origin.relevant_code(),), token + ) + def infix_parse(tokens, operators, atomic_types, trace=False): token_source = iter(tokens) @@ -216,8 +238,10 @@ def infix_parse(tokens, operators, atomic_types, trace=False): print("End of token stream") if want_noun: - raise PatsyError("expected a noun, but instead the expression ended", - c.op_stack[-1].token.origin) + raise PatsyError( + "expected a noun, but instead the expression ended", + c.op_stack[-1].token.origin, + ) while c.op_stack: if c.op_stack[-1].op.token_type == Token.LPAREN: @@ -227,28 +251,31 @@ def infix_parse(tokens, operators, atomic_types, trace=False): assert len(c.noun_stack) == 1 return c.noun_stack.pop() + # Much more thorough tests in parse_formula.py, this is just a smoke test: def test_infix_parse(): - ops = [Operator("+", 2, 10), - Operator("*", 2, 20), - Operator("-", 1, 30)] + ops = [Operator("+", 2, 10), Operator("*", 2, 20), Operator("-", 1, 30)] atomic = ["ATOM1", "ATOM2"] # a + -b * (c + d) mock_origin = Origin("asdf", 2, 3) - tokens = [Token("ATOM1", mock_origin, "a"), - Token("+", mock_origin, "+"), - Token("-", mock_origin, "-"), - Token("ATOM2", mock_origin, "b"), - Token("*", mock_origin, "*"), - Token(Token.LPAREN, mock_origin, "("), - Token("ATOM1", mock_origin, "c"), - Token("+", mock_origin, "+"), - Token("ATOM2", mock_origin, "d"), - Token(Token.RPAREN, mock_origin, ")")] + tokens = [ + Token("ATOM1", mock_origin, "a"), + Token("+", mock_origin, "+"), + Token("-", mock_origin, "-"), + Token("ATOM2", mock_origin, "b"), + Token("*", mock_origin, "*"), + Token(Token.LPAREN, mock_origin, "("), + Token("ATOM1", mock_origin, "c"), + Token("+", mock_origin, "+"), + Token("ATOM2", mock_origin, "d"), + Token(Token.RPAREN, mock_origin, ")"), + ] tree = infix_parse(tokens, ops, atomic) + def te(tree, type, extra): assert tree.type == type assert tree.token.extra == extra + te(tree, "+", "+") te(tree.args[0], "ATOM1", "a") assert tree.args[0].args == [] @@ -261,9 +288,9 @@ def te(tree, type, extra): te(tree.args[1].args[1].args[1], "ATOM2", "d") import pytest + # No ternary ops - pytest.raises(ValueError, - infix_parse, [], [Operator("+", 3, 10)], ["ATOMIC"]) + pytest.raises(ValueError, infix_parse, [], [Operator("+", 3, 10)], ["ATOMIC"]) # smoke test just to make sure there 
are no egregious bugs in 'trace' infix_parse(tokens, ops, atomic, trace=True) diff --git a/patsy/mgcv_cubic_splines.py b/patsy/mgcv_cubic_splines.py index 3aeb5eb..7acaa38 100644 --- a/patsy/mgcv_cubic_splines.py +++ b/patsy/mgcv_cubic_splines.py @@ -9,8 +9,13 @@ import numpy as np -from patsy.util import (have_pandas, atleast_2d_column_default, - no_pickling, assert_no_pickling, safe_string_eq) +from patsy.util import ( + have_pandas, + atleast_2d_column_default, + no_pickling, + assert_no_pickling, + safe_string_eq, +) from patsy.state import stateful_transform if have_pandas: @@ -32,18 +37,18 @@ def _get_natural_f(knots): """ try: from scipy import linalg - except ImportError: # pragma: no cover + except ImportError: # pragma: no cover raise ImportError("Cubic spline functionality requires scipy.") h = knots[1:] - knots[:-1] - diag = (h[:-1] + h[1:]) / 3. - ul_diag = h[1:-1] / 6. - banded_b = np.array([np.r_[0., ul_diag], diag, np.r_[ul_diag, 0.]]) + diag = (h[:-1] + h[1:]) / 3.0 + ul_diag = h[1:-1] / 6.0 + banded_b = np.array([np.r_[0.0, ul_diag], diag, np.r_[ul_diag, 0.0]]) d = np.zeros((knots.size - 2, knots.size)) for i in range(knots.size - 2): - d[i, i] = 1. / h[i] - d[i, i + 2] = 1. / h[i + 1] - d[i, i + 1] = - d[i, i] - d[i, i + 2] + d[i, i] = 1.0 / h[i] + d[i, i + 2] = 1.0 / h[i + 1] + d[i, i + 1] = -d[i, i] - d[i, i + 2] fm = linalg.solve_banded((1, 1), banded_b, d) @@ -64,9 +69,10 @@ def _map_cyclic(x, lbound, ubound): :raise ValueError: if lbound >= ubound. """ if lbound >= ubound: - raise ValueError("Invalid argument: lbound (%r) should be " - "less than ubound (%r)." - % (lbound, ubound)) + raise ValueError( + "Invalid argument: lbound (%r) should be " + "less than ubound (%r)." % (lbound, ubound) + ) x = np.copy(x) x[x > ubound] = lbound + (x[x > ubound] - ubound) % (ubound - lbound) @@ -86,6 +92,7 @@ def test__map_cyclic(): def test__map_cyclic_errors(): import pytest + x = np.linspace(0.2, 5.7, 10) pytest.raises(ValueError, _map_cyclic, x, 4.5, 3.6) pytest.raises(ValueError, _map_cyclic, x, 4.5, 4.5) @@ -106,22 +113,22 @@ def _get_cyclic_f(knots): b = np.zeros((n, n)) d = np.zeros((n, n)) - b[0, 0] = (h[n - 1] + h[0]) / 3. - b[0, n - 1] = h[n - 1] / 6. - b[n - 1, 0] = h[n - 1] / 6. + b[0, 0] = (h[n - 1] + h[0]) / 3.0 + b[0, n - 1] = h[n - 1] / 6.0 + b[n - 1, 0] = h[n - 1] / 6.0 - d[0, 0] = -1. / h[0] - 1. / h[n - 1] - d[0, n - 1] = 1. / h[n - 1] - d[n - 1, 0] = 1. / h[n - 1] + d[0, 0] = -1.0 / h[0] - 1.0 / h[n - 1] + d[0, n - 1] = 1.0 / h[n - 1] + d[n - 1, 0] = 1.0 / h[n - 1] for i in range(1, n): - b[i, i] = (h[i - 1] + h[i]) / 3. - b[i, i - 1] = h[i - 1] / 6. - b[i - 1, i] = h[i - 1] / 6. + b[i, i] = (h[i - 1] + h[i]) / 3.0 + b[i, i - 1] = h[i - 1] / 6.0 + b[i - 1, i] = h[i - 1] / 6.0 - d[i, i] = -1. / h[i - 1] - 1. / h[i] - d[i, i - 1] = 1. / h[i - 1] - d[i - 1, i] = 1. / h[i - 1] + d[i, i] = -1.0 / h[i - 1] - 1.0 / h[i] + d[i, i - 1] = 1.0 / h[i - 1] + d[i - 1, i] = 1.0 / h[i - 1] return np.linalg.solve(b, d) @@ -153,14 +160,15 @@ def _row_tensor_product(dms): tp_ncols = 1 for dm in dms: if dm.shape[0] != tp_nrows: - raise ValueError("Tensor product arguments should have " - "same number of rows.") + raise ValueError( + "Tensor product arguments should have " "same number of rows." 
+ ) tp_ncols *= dm.shape[1] tp = np.zeros((tp_nrows, tp_ncols)) - tp[:, -dms[-1].shape[1]:] = dms[-1] + tp[:, -dms[-1].shape[1] :] = dms[-1] filled_tp_ncols = dms[-1].shape[1] for dm in dms[-2::-1]: - p = - filled_tp_ncols * dm.shape[1] + p = -filled_tp_ncols * dm.shape[1] for j in range(dm.shape[1]): xj = dm[:, j] for t in range(-filled_tp_ncols, 0): @@ -173,13 +181,15 @@ def _row_tensor_product(dms): def test__row_tensor_product_errors(): import pytest + pytest.raises(ValueError, _row_tensor_product, []) pytest.raises(ValueError, _row_tensor_product, [np.arange(1, 5)]) - pytest.raises(ValueError, _row_tensor_product, - [np.arange(1, 5), np.arange(1, 5)]) - pytest.raises(ValueError, _row_tensor_product, - [np.arange(1, 13).reshape((3, 4)), - np.arange(1, 13).reshape((4, 3))]) + pytest.raises(ValueError, _row_tensor_product, [np.arange(1, 5), np.arange(1, 5)]) + pytest.raises( + ValueError, + _row_tensor_product, + [np.arange(1, 13).reshape((3, 4)), np.arange(1, 13).reshape((4, 3))], + ) def test__row_tensor_product(): @@ -202,12 +212,10 @@ def test__row_tensor_product(): # Testing main cases dm2 = np.array([[1, 2], [1, 2]]) dm3 = np.arange(1, 7).reshape((2, 3)) - expected_tp5 = np.array([[1, 2, 3, 2, 4, 6], - [4, 5, 6, 8, 10, 12]]) + expected_tp5 = np.array([[1, 2, 3, 2, 4, 6], [4, 5, 6, 8, 10, 12]]) tp5 = _row_tensor_product([dm2, dm3]) assert np.array_equal(tp5, expected_tp5) - expected_tp6 = np.array([[1, 2, 2, 4, 3, 6], - [4, 8, 5, 10, 6, 12]]) + expected_tp6 = np.array([[1, 2, 2, 4, 3, 6], [4, 8, 5, 10, 6, 12]]) tp6 = _row_tensor_product([dm3, dm2]) assert np.array_equal(tp6, expected_tp6) @@ -266,14 +274,14 @@ def _compute_base_functions(x, knots): ajm = xj1_x / hj ajp = x_xj / hj - cjm_3 = xj1_x * xj1_x * xj1_x / (6. * hj) - cjm_3[x > np.max(knots)] = 0. - cjm_1 = hj * xj1_x / 6. + cjm_3 = xj1_x * xj1_x * xj1_x / (6.0 * hj) + cjm_3[x > np.max(knots)] = 0.0 + cjm_1 = hj * xj1_x / 6.0 cjm = cjm_3 - cjm_1 - cjp_3 = x_xj * x_xj * x_xj / (6. * hj) - cjp_3[x < np.min(knots)] = 0. - cjp_1 = hj * x_xj / 6. + cjp_3 = x_xj * x_xj * x_xj / (6.0 * hj) + cjp_3[x < np.min(knots)] = 0.0 + cjp_1 = hj * x_xj / 6.0 cjp = cjp_3 - cjp_1 return ajm, ajp, cjm, cjp, j @@ -293,7 +301,7 @@ def _absorb_constraints(design_matrix, constraints): """ try: from scipy import linalg - except ImportError: # pragma: no cover + except ImportError: # pragma: no cover raise ImportError("Cubic spline functionality requires scipy.") m = constraints.shape[0] @@ -338,8 +346,7 @@ def _get_free_crs_dmatrix(x, knots, cyclic=False): else: f = _get_natural_f(knots) - dmt = ajm * i[j, :].T + ajp * i[j1, :].T + \ - cjm * f[j, :].T + cjp * f[j1, :].T + dmt = ajm * i[j, :].T + ajp * i[j1, :].T + cjm * f[j, :].T + cjp * f[j1, :].T return dmt.T @@ -387,8 +394,9 @@ def _get_te_dmatrix(design_matrices, constraints=None): # Stateful Transforms -def _get_all_sorted_knots(x, n_inner_knots=None, inner_knots=None, - lower_bound=None, upper_bound=None): +def _get_all_sorted_knots( + x, n_inner_knots=None, inner_knots=None, lower_bound=None, upper_bound=None +): """Gets all knots locations with lower and upper exterior knots included. If needed, inner knots are computed as equally spaced quantiles of the @@ -407,25 +415,31 @@ def _get_all_sorted_knots(x, n_inner_knots=None, inner_knots=None, compute ``n_inner_knots + 2`` distinct knots. 
""" if lower_bound is None and x.size == 0: - raise ValueError("Cannot set lower exterior knot location: empty " - "input data and lower_bound not specified.") + raise ValueError( + "Cannot set lower exterior knot location: empty " + "input data and lower_bound not specified." + ) elif lower_bound is None and x.size != 0: lower_bound = np.min(x) if upper_bound is None and x.size == 0: - raise ValueError("Cannot set upper exterior knot location: empty " - "input data and upper_bound not specified.") + raise ValueError( + "Cannot set upper exterior knot location: empty " + "input data and upper_bound not specified." + ) elif upper_bound is None and x.size != 0: upper_bound = np.max(x) if upper_bound < lower_bound: - raise ValueError("lower_bound > upper_bound (%r > %r)" - % (lower_bound, upper_bound)) + raise ValueError( + "lower_bound > upper_bound (%r > %r)" % (lower_bound, upper_bound) + ) if inner_knots is None and n_inner_knots is not None: if n_inner_knots < 0: - raise ValueError("Invalid requested number of inner knots: %r" - % (n_inner_knots,)) + raise ValueError( + "Invalid requested number of inner knots: %r" % (n_inner_knots,) + ) x = x[(lower_bound <= x) & (x <= upper_bound)] x = np.unique(x) @@ -437,97 +451,94 @@ def _get_all_sorted_knots(x, n_inner_knots=None, inner_knots=None, elif n_inner_knots == 0: inner_knots = np.array([]) else: - raise ValueError("No data values between lower_bound(=%r) and " - "upper_bound(=%r): cannot compute requested " - "%r inner knot(s)." - % (lower_bound, upper_bound, n_inner_knots)) + raise ValueError( + "No data values between lower_bound(=%r) and " + "upper_bound(=%r): cannot compute requested " + "%r inner knot(s)." % (lower_bound, upper_bound, n_inner_knots) + ) elif inner_knots is not None: inner_knots = np.unique(inner_knots) if n_inner_knots is not None and n_inner_knots != inner_knots.size: - raise ValueError("Needed number of inner knots=%r does not match " - "provided number of inner knots=%r." - % (n_inner_knots, inner_knots.size)) + raise ValueError( + "Needed number of inner knots=%r does not match " + "provided number of inner knots=%r." % (n_inner_knots, inner_knots.size) + ) n_inner_knots = inner_knots.size if np.any(inner_knots < lower_bound): - raise ValueError("Some knot values (%s) fall below lower bound " - "(%r)." - % (inner_knots[inner_knots < lower_bound], - lower_bound)) + raise ValueError( + "Some knot values (%s) fall below lower bound " + "(%r)." % (inner_knots[inner_knots < lower_bound], lower_bound) + ) if np.any(inner_knots > upper_bound): - raise ValueError("Some knot values (%s) fall above upper bound " - "(%r)." - % (inner_knots[inner_knots > upper_bound], - upper_bound)) + raise ValueError( + "Some knot values (%s) fall above upper bound " + "(%r)." % (inner_knots[inner_knots > upper_bound], upper_bound) + ) else: raise ValueError("Must specify either 'n_inner_knots' or 'inner_knots'.") all_knots = np.concatenate(([lower_bound, upper_bound], inner_knots)) all_knots = np.unique(all_knots) if all_knots.size != n_inner_knots + 2: - raise ValueError("Unable to compute n_inner_knots(=%r) + 2 distinct " - "knots: %r data value(s) found between " - "lower_bound(=%r) and upper_bound(=%r)." - % (n_inner_knots, x.size, lower_bound, upper_bound)) + raise ValueError( + "Unable to compute n_inner_knots(=%r) + 2 distinct " + "knots: %r data value(s) found between " + "lower_bound(=%r) and upper_bound(=%r)." 
+ % (n_inner_knots, x.size, lower_bound, upper_bound) + ) return all_knots def test__get_all_sorted_knots(): import pytest - pytest.raises(ValueError, _get_all_sorted_knots, - np.array([]), -1) - pytest.raises(ValueError, _get_all_sorted_knots, - np.array([]), 0) - pytest.raises(ValueError, _get_all_sorted_knots, - np.array([]), 0, lower_bound=1) - pytest.raises(ValueError, _get_all_sorted_knots, - np.array([]), 0, upper_bound=5) - pytest.raises(ValueError, _get_all_sorted_knots, - np.array([]), 0, lower_bound=3, upper_bound=1) + + pytest.raises(ValueError, _get_all_sorted_knots, np.array([]), -1) + pytest.raises(ValueError, _get_all_sorted_knots, np.array([]), 0) + pytest.raises(ValueError, _get_all_sorted_knots, np.array([]), 0, lower_bound=1) + pytest.raises(ValueError, _get_all_sorted_knots, np.array([]), 0, upper_bound=5) + pytest.raises( + ValueError, _get_all_sorted_knots, np.array([]), 0, lower_bound=3, upper_bound=1 + ) assert np.array_equal( - _get_all_sorted_knots(np.array([]), 0, lower_bound=1, upper_bound=5), - [1, 5]) - pytest.raises(ValueError, _get_all_sorted_knots, - np.array([]), 0, lower_bound=1, upper_bound=1) + _get_all_sorted_knots(np.array([]), 0, lower_bound=1, upper_bound=5), [1, 5] + ) + pytest.raises( + ValueError, _get_all_sorted_knots, np.array([]), 0, lower_bound=1, upper_bound=1 + ) x = np.arange(6) * 2 - pytest.raises(ValueError, _get_all_sorted_knots, - x, -2) + pytest.raises(ValueError, _get_all_sorted_knots, x, -2) + assert np.array_equal(_get_all_sorted_knots(x, 0), [0, 10]) assert np.array_equal( - _get_all_sorted_knots(x, 0), - [0, 10]) + _get_all_sorted_knots(x, 0, lower_bound=3, upper_bound=8), [3, 8] + ) assert np.array_equal( - _get_all_sorted_knots(x, 0, lower_bound=3, upper_bound=8), - [3, 8]) + _get_all_sorted_knots(x, 2, lower_bound=1, upper_bound=9), [1, 4, 6, 9] + ) + pytest.raises(ValueError, _get_all_sorted_knots, x, 2, lower_bound=1, upper_bound=3) + pytest.raises( + ValueError, _get_all_sorted_knots, x, 1, lower_bound=1.3, upper_bound=1.4 + ) assert np.array_equal( - _get_all_sorted_knots(x, 2, lower_bound=1, upper_bound=9), - [1, 4, 6, 9]) - pytest.raises(ValueError, _get_all_sorted_knots, - x, 2, lower_bound=1, upper_bound=3) - pytest.raises(ValueError, _get_all_sorted_knots, - x, 1, lower_bound=1.3, upper_bound=1.4) + _get_all_sorted_knots(x, 1, lower_bound=1, upper_bound=3), [1, 2, 3] + ) + pytest.raises(ValueError, _get_all_sorted_knots, x, 1, lower_bound=2, upper_bound=3) + pytest.raises(ValueError, _get_all_sorted_knots, x, 1, inner_knots=[2, 3]) + pytest.raises(ValueError, _get_all_sorted_knots, x, lower_bound=2, upper_bound=3) + assert np.array_equal(_get_all_sorted_knots(x, inner_knots=[3, 7]), [0, 3, 7, 10]) assert np.array_equal( - _get_all_sorted_knots(x, 1, lower_bound=1, upper_bound=3), - [1, 2, 3]) - pytest.raises(ValueError, _get_all_sorted_knots, - x, 1, lower_bound=2, upper_bound=3) - pytest.raises(ValueError, _get_all_sorted_knots, - x, 1, inner_knots=[2, 3]) - pytest.raises(ValueError, _get_all_sorted_knots, - x, lower_bound=2, upper_bound=3) - assert np.array_equal( - _get_all_sorted_knots(x, inner_knots=[3, 7]), - [0, 3, 7, 10]) - assert np.array_equal( - _get_all_sorted_knots(x, inner_knots=[3, 7], lower_bound=2), - [2, 3, 7, 10]) - pytest.raises(ValueError, _get_all_sorted_knots, - x, inner_knots=[3, 7], lower_bound=4) - pytest.raises(ValueError, _get_all_sorted_knots, - x, inner_knots=[3, 7], upper_bound=6) + _get_all_sorted_knots(x, inner_knots=[3, 7], lower_bound=2), [2, 3, 7, 10] + ) + pytest.raises( + 
ValueError, _get_all_sorted_knots, x, inner_knots=[3, 7], lower_bound=4 + ) + pytest.raises( + ValueError, _get_all_sorted_knots, x, inner_knots=[3, 7], upper_bound=6 + ) def _get_centering_constraint_from_dmatrix(design_matrix): - """ Computes the centering constraint from the given design matrix. + """Computes the centering constraint from the given design matrix. We want to ensure that if ``b`` is the array of parameters, our model is centered, ie ``np.mean(np.dot(design_matrix, b))`` is zero. @@ -551,6 +562,7 @@ class CubicRegressionSpline(object): - ``cc(x, df=None, knots=None, lower_bound=None, upper_bound=None, constraints=None)`` for cyclic cubic regression spline """ + common_doc = """ :arg df: The number of degrees of freedom to use for this spline. The return value will have this many columns. You must specify at least one @@ -589,24 +601,31 @@ def __init__(self, name, cyclic): self._all_knots = None self._constraints = None - def memorize_chunk(self, x, df=None, knots=None, - lower_bound=None, upper_bound=None, - constraints=None): - args = {"df": df, - "knots": knots, - "lower_bound": lower_bound, - "upper_bound": upper_bound, - "constraints": constraints, - } + def memorize_chunk( + self, + x, + df=None, + knots=None, + lower_bound=None, + upper_bound=None, + constraints=None, + ): + args = { + "df": df, + "knots": knots, + "lower_bound": lower_bound, + "upper_bound": upper_bound, + "constraints": constraints, + } self._tmp["args"] = args x = np.atleast_1d(x) if x.ndim == 2 and x.shape[1] == 1: x = x[:, 0] if x.ndim > 1: - raise ValueError("Input to %r must be 1-d, " - "or a 2-d column vector." - % (self._name,)) + raise ValueError( + "Input to %r must be 1-d, " "or a 2-d column vector." % (self._name,) + ) self._tmp.setdefault("xs", []).append(x) @@ -630,8 +649,7 @@ def memorize_finish(self): else: constraints = np.atleast_2d(constraints) if constraints.ndim != 2: - raise ValueError("Constraints must be 2-d array or " - "1-d vector.") + raise ValueError("Constraints must be 2-d array or " "1-d vector.") n_constraints = constraints.shape[0] n_inner_knots = None @@ -640,16 +658,20 @@ def memorize_finish(self): if not self._cyclic and n_constraints == 0: min_df = 2 if args["df"] < min_df: - raise ValueError("'df'=%r must be greater than or equal to %r." - % (args["df"], min_df)) + raise ValueError( + "'df'=%r must be greater than or equal to %r." + % (args["df"], min_df) + ) n_inner_knots = args["df"] - 2 + n_constraints if self._cyclic: n_inner_knots += 1 - self._all_knots = _get_all_sorted_knots(x, - n_inner_knots=n_inner_knots, - inner_knots=args["knots"], - lower_bound=args["lower_bound"], - upper_bound=args["upper_bound"]) + self._all_knots = _get_all_sorted_knots( + x, + n_inner_knots=n_inner_knots, + inner_knots=args["knots"], + lower_bound=args["lower_bound"], + upper_bound=args["upper_bound"], + ) if constraints is not None: if safe_string_eq(constraints, "center"): # Now we can compute centering constraints @@ -661,24 +683,32 @@ def memorize_finish(self): if self._cyclic: df_before_constraints -= 1 if constraints.shape[1] != df_before_constraints: - raise ValueError("Constraints array should have %r columns but" - " %r found." - % (df_before_constraints, constraints.shape[1])) + raise ValueError( + "Constraints array should have %r columns but" + " %r found." 
% (df_before_constraints, constraints.shape[1]) + ) self._constraints = constraints - def transform(self, x, df=None, knots=None, - lower_bound=None, upper_bound=None, - constraints=None): + def transform( + self, + x, + df=None, + knots=None, + lower_bound=None, + upper_bound=None, + constraints=None, + ): x_orig = x x = np.atleast_1d(x) if x.ndim == 2 and x.shape[1] == 1: x = x[:, 0] if x.ndim > 1: - raise ValueError("Input to %r must be 1-d, " - "or a 2-d column vector." - % (self._name,)) - dm = _get_crs_dmatrix(x, self._all_knots, - self._constraints, cyclic=self._cyclic) + raise ValueError( + "Input to %r must be 1-d, " "or a 2-d column vector." % (self._name,) + ) + dm = _get_crs_dmatrix( + x, self._all_knots, self._constraints, cyclic=self._cyclic + ) if have_pandas: if isinstance(x_orig, (pandas.Series, pandas.DataFrame)): dm = pandas.DataFrame(dm) @@ -714,7 +744,8 @@ class CR(CubicRegressionSpline): __doc__ += CubicRegressionSpline.common_doc def __init__(self): - CubicRegressionSpline.__init__(self, name='cr', cyclic=False) + CubicRegressionSpline.__init__(self, name="cr", cyclic=False) + cr = stateful_transform(CR) @@ -744,26 +775,31 @@ class CC(CubicRegressionSpline): __doc__ += CubicRegressionSpline.common_doc def __init__(self): - CubicRegressionSpline.__init__(self, name='cc', cyclic=True) + CubicRegressionSpline.__init__(self, name="cc", cyclic=True) + cc = stateful_transform(CC) def test_crs_errors(): import pytest + # Invalid 'x' shape pytest.raises(ValueError, cr, np.arange(16).reshape((4, 4)), df=4) - pytest.raises(ValueError, CR().transform, - np.arange(16).reshape((4, 4)), df=4) + pytest.raises(ValueError, CR().transform, np.arange(16).reshape((4, 4)), df=4) # Should provide at least 'df' or 'knots' pytest.raises(ValueError, cr, np.arange(50)) # Invalid constraints shape - pytest.raises(ValueError, cr, np.arange(50), df=4, - constraints=np.arange(27).reshape((3, 3, 3))) + pytest.raises( + ValueError, + cr, + np.arange(50), + df=4, + constraints=np.arange(27).reshape((3, 3, 3)), + ) # Invalid nb of columns in constraints # (should have df + 1 = 5, but 6 provided) - pytest.raises(ValueError, cr, np.arange(50), df=4, - constraints=np.arange(6)) + pytest.raises(ValueError, cr, np.arange(50), df=4, constraints=np.arange(6)) # Too small 'df' for natural cubic spline pytest.raises(ValueError, cr, np.arange(50), df=1) # Too small 'df' for cyclic cubic spline @@ -772,9 +808,12 @@ def test_crs_errors(): def test_crs_compat(): from patsy.test_state import check_stateful - from patsy.test_splines_crs_data import (R_crs_test_x, - R_crs_test_data, - R_crs_num_tests) + from patsy.test_splines_crs_data import ( + R_crs_test_x, + R_crs_test_data, + R_crs_num_tests, + ) + lines = R_crs_test_data.split("\n") tests_ran = 0 start_idx = lines.index("--BEGIN TEST CASE--") @@ -796,8 +835,9 @@ def test_crs_compat(): spline_type = CC adjust_df += 1 else: - raise ValueError("Unrecognized spline type %r" - % (test_data["spline_type"],)) + raise ValueError( + "Unrecognized spline type %r" % (test_data["spline_type"],) + ) kwargs = {} if test_data["absorb_cons"] == "TRUE": kwargs["constraints"] = "center" @@ -818,37 +858,53 @@ def test_crs_compat(): start_idx = stop_idx + 1 assert tests_ran == R_crs_num_tests + test_crs_compat.slow = True + def test_crs_with_specific_constraint(): from patsy.highlevel import incr_dbuilder, build_design_matrices, dmatrix - x = (-1.5)**np.arange(20) + + x = (-1.5) ** np.arange(20) # Hard coded R values for smooth: s(x, bs="cr", k=5) # R> knots <- smooth$xp - 
knots_R = np.array([-2216.837820053100585937, - -50.456909179687500000, - -0.250000000000000000, - 33.637939453125000000, - 1477.891880035400390625]) + knots_R = np.array( + [ + -2216.837820053100585937, + -50.456909179687500000, + -0.250000000000000000, + 33.637939453125000000, + 1477.891880035400390625, + ] + ) # R> centering.constraint <- t(qr.X(attr(smooth, "qrc"))) - centering_constraint_R = np.array([[0.064910676323168478574, - 1.4519875239407085132, - -2.1947446912471946234, - 1.6129783104357671153, - 0.064868180547550072235]]) + centering_constraint_R = np.array( + [ + [ + 0.064910676323168478574, + 1.4519875239407085132, + -2.1947446912471946234, + 1.6129783104357671153, + 0.064868180547550072235, + ] + ] + ) # values for which we want a prediction - new_x = np.array([-3000., -200., 300., 2000.]) - result1 = dmatrix("cr(new_x, knots=knots_R[1:-1], " - "lower_bound=knots_R[0], upper_bound=knots_R[-1], " - "constraints=centering_constraint_R)") + new_x = np.array([-3000.0, -200.0, 300.0, 2000.0]) + result1 = dmatrix( + "cr(new_x, knots=knots_R[1:-1], " + "lower_bound=knots_R[0], upper_bound=knots_R[-1], " + "constraints=centering_constraint_R)" + ) data_chunked = [{"x": x[:10]}, {"x": x[10:]}] new_data = {"x": new_x} - builder = incr_dbuilder("cr(x, df=4, constraints='center')", - lambda: iter(data_chunked)) + builder = incr_dbuilder( + "cr(x, df=4, constraints='center')", lambda: iter(data_chunked) + ) result2 = build_design_matrices([builder], new_data)[0] - assert np.allclose(result1, result2, rtol=1e-12, atol=0.) + assert np.allclose(result1, result2, rtol=1e-12, atol=0.0) class TE(object): @@ -888,20 +944,22 @@ class TE(object): .. versionadded:: 0.3.0 """ + def __init__(self): self._tmp = {} self._constraints = None def memorize_chunk(self, *args, **kwargs): - constraints = self._tmp.setdefault("constraints", - kwargs.get("constraints")) + constraints = self._tmp.setdefault("constraints", kwargs.get("constraints")) if safe_string_eq(constraints, "center"): args_2d = [] for arg in args: arg = atleast_2d_column_default(arg) if arg.ndim != 2: - raise ValueError("Each tensor product argument must be " - "a 2-d array or 1-d vector.") + raise ValueError( + "Each tensor product argument must be " + "a 2-d array or 1-d vector." + ) args_2d.append(arg) tp = _row_tensor_product(args_2d) @@ -924,8 +982,7 @@ def memorize_finish(self): else: constraints = np.atleast_2d(constraints) if constraints.ndim != 2: - raise ValueError("Constraints must be 2-d array or " - "1-d vector.") + raise ValueError("Constraints must be 2-d array or " "1-d vector.") self._constraints = constraints @@ -934,159 +991,263 @@ def transform(self, *args, **kwargs): for arg in args: arg = atleast_2d_column_default(arg) if arg.ndim != 2: - raise ValueError("Each tensor product argument must be " - "a 2-d array or 1-d vector.") + raise ValueError( + "Each tensor product argument must be " "a 2-d array or 1-d vector." 
+ ) args_2d.append(arg) return _get_te_dmatrix(args_2d, self._constraints) __getstate__ = no_pickling + te = stateful_transform(TE) def test_te_errors(): import pytest + x = np.arange(27) # Invalid input shape pytest.raises(ValueError, te, x.reshape((3, 3, 3))) - pytest.raises(ValueError, te, x.reshape((3, 3, 3)), constraints='center') + pytest.raises(ValueError, te, x.reshape((3, 3, 3)), constraints="center") # Invalid constraints shape - pytest.raises(ValueError, te, x, - constraints=np.arange(8).reshape((2, 2, 2))) + pytest.raises(ValueError, te, x, constraints=np.arange(8).reshape((2, 2, 2))) def test_te_1smooth(): from patsy.splines import bs + # Tensor product of 1 smooth covariate should be the same # as the smooth alone - x = (-1.5)**np.arange(20) + x = (-1.5) ** np.arange(20) assert np.allclose(cr(x, df=6), te(cr(x, df=6))) assert np.allclose(cc(x, df=5), te(cc(x, df=5))) assert np.allclose(bs(x, df=4), te(bs(x, df=4))) # Adding centering constraint to tensor product - assert np.allclose(cr(x, df=3, constraints='center'), - te(cr(x, df=4), constraints='center')) + assert np.allclose( + cr(x, df=3, constraints="center"), te(cr(x, df=4), constraints="center") + ) # Adding specific constraint center_constraint = np.arange(1, 5) - assert np.allclose(cr(x, df=3, constraints=center_constraint), - te(cr(x, df=4), constraints=center_constraint)) + assert np.allclose( + cr(x, df=3, constraints=center_constraint), + te(cr(x, df=4), constraints=center_constraint), + ) def test_te_2smooths(): from patsy.highlevel import incr_dbuilder, build_design_matrices - x1 = (-1.5)**np.arange(20) - x2 = (1.6)**np.arange(20) + + x1 = (-1.5) ** np.arange(20) + x2 = (1.6) ** np.arange(20) # Hard coded R results for smooth: te(x1, x2, bs=c("cs", "cc"), k=c(5,7)) # Without centering constraint: - dmatrix_R_nocons = \ - np.array([[-4.4303024184609255207e-06, 7.9884438387230142235e-06, - 9.7987758194797719025e-06, -7.2894213245475212959e-08, - 1.5907686862964493897e-09, -3.2565884983072595159e-11, - 0.0170749607855874667439, -3.0788499835965849050e-02, - -3.7765754357352458725e-02, 2.8094376299826799787e-04, - -6.1310290747349201414e-06, 1.2551314933193442915e-07, - -0.26012671685838206770, 4.6904420337437874311e-01, - 0.5753384627946153129230, -4.2800085814700449330e-03, - 9.3402525733484874533e-05, -1.9121170389937518131e-06, - -0.0904312240489447832781, 1.6305991924427923334e-01, - 2.0001237112941641638e-01, -1.4879148887003382663e-03, - 3.2470731316462736135e-05, -6.6473404365914134499e-07, - 2.0447857920168824846e-05, -3.6870296695050991799e-05, - -4.5225801045409022233e-05, 3.3643990293641665710e-07, - -7.3421200200015877329e-09, 1.5030635073660743297e-10], - [-9.4006130602653794302e-04, 7.8681398069163730347e-04, - 2.4573006857381437217e-04, -1.4524712230452725106e-04, - 7.8216741353106329551e-05, -3.1304283003914264551e-04, - 3.6231183382798337611064, -3.0324832476174168328e+00, - -9.4707559178211142559e-01, 5.5980126937492580286e-01, - -3.0145747744342332730e-01, 1.2065077148806895302e+00, - -35.17561267504181188315, 2.9441339255948005160e+01, - 9.1948319320782125885216, -5.4349184288245195873e+00, - 2.9267472035096449012e+00, -1.1713569391233907169e+01, - 34.0275626863976370373166, -2.8480442582712722555e+01, - -8.8947340548151565542e+00, 5.2575353623762932642e+00, - -2.8312249982592527786e+00, 1.1331265795534763541e+01, - 7.9462158845078978420e-01, -6.6508361863670617531e-01, - -2.0771242914526857892e-01, 1.2277550230353953542e-01, - -6.6115593588420035198e-02, 2.6461103043402139923e-01]]) + 
dmatrix_R_nocons = np.array( + [ + [ + -4.4303024184609255207e-06, + 7.9884438387230142235e-06, + 9.7987758194797719025e-06, + -7.2894213245475212959e-08, + 1.5907686862964493897e-09, + -3.2565884983072595159e-11, + 0.0170749607855874667439, + -3.0788499835965849050e-02, + -3.7765754357352458725e-02, + 2.8094376299826799787e-04, + -6.1310290747349201414e-06, + 1.2551314933193442915e-07, + -0.26012671685838206770, + 4.6904420337437874311e-01, + 0.5753384627946153129230, + -4.2800085814700449330e-03, + 9.3402525733484874533e-05, + -1.9121170389937518131e-06, + -0.0904312240489447832781, + 1.6305991924427923334e-01, + 2.0001237112941641638e-01, + -1.4879148887003382663e-03, + 3.2470731316462736135e-05, + -6.6473404365914134499e-07, + 2.0447857920168824846e-05, + -3.6870296695050991799e-05, + -4.5225801045409022233e-05, + 3.3643990293641665710e-07, + -7.3421200200015877329e-09, + 1.5030635073660743297e-10, + ], + [ + -9.4006130602653794302e-04, + 7.8681398069163730347e-04, + 2.4573006857381437217e-04, + -1.4524712230452725106e-04, + 7.8216741353106329551e-05, + -3.1304283003914264551e-04, + 3.6231183382798337611064, + -3.0324832476174168328e00, + -9.4707559178211142559e-01, + 5.5980126937492580286e-01, + -3.0145747744342332730e-01, + 1.2065077148806895302e00, + -35.17561267504181188315, + 2.9441339255948005160e01, + 9.1948319320782125885216, + -5.4349184288245195873e00, + 2.9267472035096449012e00, + -1.1713569391233907169e01, + 34.0275626863976370373166, + -2.8480442582712722555e01, + -8.8947340548151565542e00, + 5.2575353623762932642e00, + -2.8312249982592527786e00, + 1.1331265795534763541e01, + 7.9462158845078978420e-01, + -6.6508361863670617531e-01, + -2.0771242914526857892e-01, + 1.2277550230353953542e-01, + -6.6115593588420035198e-02, + 2.6461103043402139923e-01, + ], + ] + ) # With centering constraint: - dmatrix_R_cons = \ - np.array([[0.00329998606323867252343, 1.6537431155796576600e-04, - -1.2392262709790753433e-04, 6.5405304166706783407e-05, - -6.6764045799537624095e-05, -0.1386431081763726258504, - 0.124297283800864313830, -3.5487293655619825405e-02, - -3.0527115315785902268e-03, 5.2009247643311604277e-04, - -0.00384203992301702674378, -0.058901915802819435064, - 0.266422358491648914036, 0.5739281693874087597607, - -1.3171008503525844392e-03, 8.2573456631878912413e-04, - 6.6730833453016958831e-03, -0.1467677784718444955470, - 0.220757650934837484913, 0.1983127687880171796664, - -1.6269930328365173316e-03, -1.7785892412241208812e-03, - -3.2702835436351201243e-03, -4.3252183044300757109e-02, - 4.3403766976235179376e-02, 3.5973406402893762387e-05, - -5.4035858568225075046e-04, 2.9565209382794241247e-04, - -2.2769990750264097637e-04], - [0.41547954838956052681098, 1.9843570584107707994e-02, - -1.5746590234791378593e-02, 8.3171184312221431434e-03, - -8.7233014052017516377e-03, -15.9926770785086258541696, - 16.503663226274017716833, -6.6005803955894726265e-01, - 1.3986092022708346283e-01, -2.3516913533670955050e-01, - 0.72251037497207359905360, -9.827337059999853963177, - 3.917078117294827688255, 9.0171773596973618936090, - -5.0616811270787671617e+00, 3.0189990249009683865e+00, - -1.0872720629943064097e+01, 26.9308504460453121964747, - -21.212262927009287949431, -9.1088328555582247503253, - 5.2400156972500298025e+00, -3.0593641098325474736e+00, - 1.0919392118399086300e+01, -4.6564290223265718538e+00, - 4.8071307441606982991e+00, -1.9748377005689798924e-01, - 5.4664183716965096538e-02, -2.8871392916916285148e-02, - 2.3592766838010845176e-01]]) + dmatrix_R_cons = np.array( + [ + [ + 
0.00329998606323867252343, + 1.6537431155796576600e-04, + -1.2392262709790753433e-04, + 6.5405304166706783407e-05, + -6.6764045799537624095e-05, + -0.1386431081763726258504, + 0.124297283800864313830, + -3.5487293655619825405e-02, + -3.0527115315785902268e-03, + 5.2009247643311604277e-04, + -0.00384203992301702674378, + -0.058901915802819435064, + 0.266422358491648914036, + 0.5739281693874087597607, + -1.3171008503525844392e-03, + 8.2573456631878912413e-04, + 6.6730833453016958831e-03, + -0.1467677784718444955470, + 0.220757650934837484913, + 0.1983127687880171796664, + -1.6269930328365173316e-03, + -1.7785892412241208812e-03, + -3.2702835436351201243e-03, + -4.3252183044300757109e-02, + 4.3403766976235179376e-02, + 3.5973406402893762387e-05, + -5.4035858568225075046e-04, + 2.9565209382794241247e-04, + -2.2769990750264097637e-04, + ], + [ + 0.41547954838956052681098, + 1.9843570584107707994e-02, + -1.5746590234791378593e-02, + 8.3171184312221431434e-03, + -8.7233014052017516377e-03, + -15.9926770785086258541696, + 16.503663226274017716833, + -6.6005803955894726265e-01, + 1.3986092022708346283e-01, + -2.3516913533670955050e-01, + 0.72251037497207359905360, + -9.827337059999853963177, + 3.917078117294827688255, + 9.0171773596973618936090, + -5.0616811270787671617e00, + 3.0189990249009683865e00, + -1.0872720629943064097e01, + 26.9308504460453121964747, + -21.212262927009287949431, + -9.1088328555582247503253, + 5.2400156972500298025e00, + -3.0593641098325474736e00, + 1.0919392118399086300e01, + -4.6564290223265718538e00, + 4.8071307441606982991e00, + -1.9748377005689798924e-01, + 5.4664183716965096538e-02, + -2.8871392916916285148e-02, + 2.3592766838010845176e-01, + ], + ] + ) new_x1 = np.array([11.390625, 656.84083557128906250]) new_x2 = np.array([16.777216000000006346, 1844.6744073709567147]) new_data = {"x1": new_x1, "x2": new_x2} - data_chunked = [{"x1": x1[:10], "x2": x2[:10]}, - {"x1": x1[10:], "x2": x2[10:]}] + data_chunked = [{"x1": x1[:10], "x2": x2[:10]}, {"x1": x1[10:], "x2": x2[10:]}] - builder = incr_dbuilder("te(cr(x1, df=5), cc(x2, df=6)) - 1", - lambda: iter(data_chunked)) + builder = incr_dbuilder( + "te(cr(x1, df=5), cc(x2, df=6)) - 1", lambda: iter(data_chunked) + ) dmatrix_nocons = build_design_matrices([builder], new_data)[0] - assert np.allclose(dmatrix_nocons, dmatrix_R_nocons, rtol=1e-12, atol=0.) + assert np.allclose(dmatrix_nocons, dmatrix_R_nocons, rtol=1e-12, atol=0.0) - builder = incr_dbuilder("te(cr(x1, df=5), cc(x2, df=6), " - "constraints='center') - 1", - lambda: iter(data_chunked)) + builder = incr_dbuilder( + "te(cr(x1, df=5), cc(x2, df=6), " "constraints='center') - 1", + lambda: iter(data_chunked), + ) dmatrix_cons = build_design_matrices([builder], new_data)[0] - assert np.allclose(dmatrix_cons, dmatrix_R_cons, rtol=1e-12, atol=0.) 
+ assert np.allclose(dmatrix_cons, dmatrix_R_cons, rtol=1e-12, atol=0.0) def test_te_3smooths(): from patsy.highlevel import incr_dbuilder, build_design_matrices - x1 = (-1.5)**np.arange(20) - x2 = (1.6)**np.arange(20) - x3 = (-1.2)**np.arange(20) + + x1 = (-1.5) ** np.arange(20) + x2 = (1.6) ** np.arange(20) + x3 = (-1.2) ** np.arange(20) # Hard coded R results for smooth: te(x1, x2, x3, bs=c("cr", "cs", "cc"), k=c(3,3,4)) - design_matrix_R = \ - np.array([[7.2077663709837084334e-05, 2.0648333344343273131e-03, - -4.7934014082310591768e-04, 2.3923430783992746568e-04, - 6.8534265421922660466e-03, -1.5909867344112936776e-03, - -6.8057712777151204314e-09, -1.9496724335203412851e-07, - 4.5260614658693259131e-08, 0.0101479754187435277507, - 0.290712501531622591333, -0.067487370093906928759, - 0.03368233306025386619709, 0.9649092451763204847381, - -0.2239985793289433757547, -9.5819975394704535133e-07, - -2.7449874082511405643e-05, 6.3723431275833230217e-06, - -1.5205851762850489204e-04, -0.00435607204539782688624, - 0.00101123909269346416370, -5.0470024059694933508e-04, - -1.4458319360584082416e-02, 3.3564223914790921634e-03, - 1.4357783514933466209e-08, 4.1131230514870551983e-07, - -9.5483976834512651038e-08]]) - new_data = {"x1": -38.443359375000000000, - "x2": 68.719476736000032702, - "x3": -5.1597803519999985156} - data_chunked = [{"x1": x1[:10], "x2": x2[:10], "x3": x3[:10]}, - {"x1": x1[10:], "x2": x2[10:], "x3": x3[10:]}] - builder = incr_dbuilder("te(cr(x1, df=3), cr(x2, df=3), cc(x3, df=3)) - 1", - lambda: iter(data_chunked)) + design_matrix_R = np.array( + [ + [ + 7.2077663709837084334e-05, + 2.0648333344343273131e-03, + -4.7934014082310591768e-04, + 2.3923430783992746568e-04, + 6.8534265421922660466e-03, + -1.5909867344112936776e-03, + -6.8057712777151204314e-09, + -1.9496724335203412851e-07, + 4.5260614658693259131e-08, + 0.0101479754187435277507, + 0.290712501531622591333, + -0.067487370093906928759, + 0.03368233306025386619709, + 0.9649092451763204847381, + -0.2239985793289433757547, + -9.5819975394704535133e-07, + -2.7449874082511405643e-05, + 6.3723431275833230217e-06, + -1.5205851762850489204e-04, + -0.00435607204539782688624, + 0.00101123909269346416370, + -5.0470024059694933508e-04, + -1.4458319360584082416e-02, + 3.3564223914790921634e-03, + 1.4357783514933466209e-08, + 4.1131230514870551983e-07, + -9.5483976834512651038e-08, + ] + ] + ) + new_data = { + "x1": -38.443359375000000000, + "x2": 68.719476736000032702, + "x3": -5.1597803519999985156, + } + data_chunked = [ + {"x1": x1[:10], "x2": x2[:10], "x3": x3[:10]}, + {"x1": x1[10:], "x2": x2[10:], "x3": x3[10:]}, + ] + builder = incr_dbuilder( + "te(cr(x1, df=3), cr(x2, df=3), cc(x3, df=3)) - 1", lambda: iter(data_chunked) + ) design_matrix = build_design_matrices([builder], new_data)[0] - assert np.allclose(design_matrix, design_matrix_R, rtol=1e-12, atol=0.) 
+ assert np.allclose(design_matrix, design_matrix_R, rtol=1e-12, atol=0.0) diff --git a/patsy/missing.py b/patsy/missing.py index 3235739..b4d8a01 100644 --- a/patsy/missing.py +++ b/patsy/missing.py @@ -38,17 +38,19 @@ import numpy as np from patsy import PatsyError -from patsy.util import (safe_isnan, safe_scalar_isnan, - no_pickling, assert_no_pickling) +from patsy.util import safe_isnan, safe_scalar_isnan, no_pickling, assert_no_pickling # These are made available in the patsy.* namespace __all__ = ["NAAction"] _valid_NA_types = ["None", "NaN"] _valid_NA_responses = ["raise", "drop"] + + def _desc_options(options): return ", ".join([repr(opt) for opt in options]) + class NAAction(object): """An :class:`NAAction` object defines a strategy for handling missing data. @@ -85,6 +87,7 @@ class NAAction(object): instance of this class, or your own object that implements the same interface, and pass that as the ``NA_action=`` argument instead. """ + def __init__(self, on_NA="drop", NA_types=["None", "NaN"]): """The :class:`NAAction` constructor takes the following arguments: @@ -104,17 +107,19 @@ def __init__(self, on_NA="drop", NA_types=["None", "NaN"]): """ self.on_NA = on_NA if self.on_NA not in _valid_NA_responses: - raise ValueError("invalid on_NA action %r " - "(should be one of %s)" - % (on_NA, _desc_options(_valid_NA_responses))) + raise ValueError( + "invalid on_NA action %r " + "(should be one of %s)" % (on_NA, _desc_options(_valid_NA_responses)) + ) if isinstance(NA_types, str): raise ValueError("NA_types should be a list of strings") self.NA_types = tuple(NA_types) for NA_type in self.NA_types: if NA_type not in _valid_NA_types: - raise ValueError("invalid NA_type %r " - "(should be one of %s)" - % (NA_type, _desc_options(_valid_NA_types))) + raise ValueError( + "invalid NA_type %r " + "(should be one of %s)" % (NA_type, _desc_options(_valid_NA_types)) + ) def is_categorical_NA(self, obj): """Return True if `obj` is a categorical NA value. 
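# A minimal sketch of how an NAAction plugs into the high-level API; the
# three-value "data" dict is made up for illustration:
import numpy as np
from patsy import dmatrix, NAAction, PatsyError

data = {"x": [1.0, np.nan, 3.0]}
# The default action ("drop") silently removes any row with a missing value:
assert dmatrix("x", data).shape[0] == 2
# on_NA="raise" instead reports the offending value, with its origin:
try:
    dmatrix("x", data, NA_action=NAAction(on_NA="raise"))
except PatsyError as e:
    print(e)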
@@ -163,7 +168,7 @@ def handle_NA(self, values, is_NAs, origins): return self._handle_NA_raise(values, is_NAs, origins) elif self.on_NA == "drop": return self._handle_NA_drop(values, is_NAs, origins) - else: # pragma: no cover + else: # pragma: no cover assert False def _handle_NA_raise(self, values, is_NAs, origins): @@ -182,14 +187,17 @@ def _handle_NA_drop(self, values, is_NAs, origins): __getstate__ = no_pickling + def test_NAAction_basic(): import pytest + pytest.raises(ValueError, NAAction, on_NA="pord") pytest.raises(ValueError, NAAction, NA_types=("NaN", "asdf")) pytest.raises(ValueError, NAAction, NA_types="NaN") assert_no_pickling(NAAction()) + def test_NAAction_NA_types_numerical(): for NA_types in [[], ["NaN"], ["None"], ["NaN", "None"]]: action = NAAction(NA_types=NA_types) @@ -206,6 +214,7 @@ def test_NAAction_NA_types_numerical(): got_NA_mask = action.is_numerical_NA(arr) assert np.array_equal(got_NA_mask, exp_NA_mask) + def test_NAAction_NA_types_categorical(): for NA_types in [[], ["NaN"], ["None"], ["NaN", "None"]]: action = NAAction(NA_types=NA_types) @@ -214,47 +223,45 @@ def test_NAAction_NA_types_categorical(): assert action.is_categorical_NA(None) == ("None" in NA_types) assert action.is_categorical_NA(np.nan) == ("NaN" in NA_types) + def test_NAAction_drop(): action = NAAction("drop") - in_values = [np.asarray([-1, 2, -1, 4, 5]), - np.asarray([10.0, 20.0, 30.0, 40.0, 50.0]), - np.asarray([[1.0, np.nan], - [3.0, 4.0], - [10.0, 5.0], - [6.0, 7.0], - [8.0, np.nan]]), - ] - is_NAs = [np.asarray([True, False, True, False, False]), - np.zeros(5, dtype=bool), - np.asarray([True, False, False, False, True]), - ] + in_values = [ + np.asarray([-1, 2, -1, 4, 5]), + np.asarray([10.0, 20.0, 30.0, 40.0, 50.0]), + np.asarray([[1.0, np.nan], [3.0, 4.0], [10.0, 5.0], [6.0, 7.0], [8.0, np.nan]]), + ] + is_NAs = [ + np.asarray([True, False, True, False, False]), + np.zeros(5, dtype=bool), + np.asarray([True, False, False, False, True]), + ] out_values = action.handle_NA(in_values, is_NAs, [None] * 3) assert len(out_values) == 3 assert np.array_equal(out_values[0], [2, 4]) assert np.array_equal(out_values[1], [20.0, 40.0]) assert np.array_equal(out_values[2], [[3.0, 4.0], [6.0, 7.0]]) + def test_NAAction_raise(): action = NAAction(on_NA="raise") # no-NA just passes through: - in_arrs = [np.asarray([1.1, 1.2]), - np.asarray([1, 2])] + in_arrs = [np.asarray([1.1, 1.2]), np.asarray([1, 2])] is_NAs = [np.asarray([False, False])] * 2 got_arrs = action.handle_NA(in_arrs, is_NAs, [None, None]) assert np.array_equal(got_arrs[0], in_arrs[0]) assert np.array_equal(got_arrs[1], in_arrs[1]) from patsy.origin import Origin + o1 = Origin("asdf", 0, 1) o2 = Origin("asdf", 2, 3) # NA raises an error with a correct origin in_idx = np.arange(2) - in_arrs = [np.asarray([1.1, 1.2]), - np.asarray([1.0, np.nan])] - is_NAs = [np.asarray([False, False]), - np.asarray([False, True])] + in_arrs = [np.asarray([1.1, 1.2]), np.asarray([1.0, np.nan])] + is_NAs = [np.asarray([False, False]), np.asarray([False, True])] try: action.handle_NA(in_arrs, is_NAs, [o1, o2]) assert False diff --git a/patsy/origin.py b/patsy/origin.py index 68ed71a..fcabf21 100644 --- a/patsy/origin.py +++ b/patsy/origin.py @@ -10,6 +10,7 @@ # These are made available in the patsy.* namespace __all__ = ["Origin"] + class Origin(object): """This represents the origin of some object in some string. @@ -52,7 +53,7 @@ def combine(cls, origin_objs): * ``None`` * An object that has a ``.origin`` attribute which fulfills the above criteria. 
- + Returns either an Origin object, or None. """ origins = [] @@ -73,13 +74,15 @@ def combine(cls, origin_objs): def relevant_code(self): """Extracts and returns the span of the original code represented by this Origin. Example: ``x1``.""" - return self.code[self.start:self.end] + return self.code[self.start : self.end] def __eq__(self, other): - return (isinstance(other, Origin) - and self.code == other.code - and self.start == other.start - and self.end == other.end) + return ( + isinstance(other, Origin) + and self.code == other.code + and self.start == other.start + and self.end == other.end + ) def __ne__(self, other): return not self == other @@ -98,24 +101,28 @@ def caretize(self, indent=0): indented by this much. The returned string does not have a trailing newline. """ - return ("%s%s\n%s%s%s" - % (" " * indent, - self.code, - " " * indent, - " " * self.start, - "^" * (self.end - self.start))) + return "%s%s\n%s%s%s" % ( + " " * indent, + self.code, + " " * indent, + " " * self.start, + "^" * (self.end - self.start), + ) def __repr__(self): return "%s<-%s (%s-%s)>" % ( - self.code[:self.start], - self.code[self.start:self.end], - self.code[self.end:], - self.start, self.end) + self.code[: self.start], + self.code[self.start : self.end], + self.code[self.end :], + self.start, + self.end, + ) # We reimplement patsy.util.no_pickling, to avoid circular import issues def __getstate__(self): raise NotImplementedError + def test_Origin(): o1 = Origin("012345", 2, 4) o2 = Origin("012345", 4, 5) @@ -131,6 +138,7 @@ def test_Origin(): class ObjWithOrigin(object): def __init__(self, origin=None): self.origin = origin + o4 = Origin.combine([ObjWithOrigin(o1), ObjWithOrigin(), None]) assert o4 == o1 o5 = Origin.combine([ObjWithOrigin(o1), o2]) @@ -139,4 +147,5 @@ def __init__(self, origin=None): assert Origin.combine([ObjWithOrigin(), ObjWithOrigin()]) is None from patsy.util import assert_no_pickling + assert_no_pickling(Origin("", 0, 0)) diff --git a/patsy/parse_formula.py b/patsy/parse_formula.py index afab2d4..8d0c615 100644 --- a/patsy/parse_formula.py +++ b/patsy/parse_formula.py @@ -1,4 +1,4 @@ - # This file is part of Patsy +# This file is part of Patsy # Copyright (C) 2011 Nathaniel Smith # See file LICENSE.txt for license information. 
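# A quick sketch of the Origin bookkeeping that the tokenizer in the hunks
# below relies on; the formula string is made up for illustration:
from patsy.origin import Origin

o = Origin("y ~ x1 + x2", 4, 6)
assert o.relevant_code() == "x1"
print(o.caretize())
# prints:
# y ~ x1 + x2
#     ^^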
@@ -23,6 +23,7 @@ _atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "NUMBER"] + def _is_a(f, v): try: f(v) @@ -31,6 +32,7 @@ def _is_a(f, v): else: return True + # Helper function for _tokenize_formula: def _read_python_expr(it, end_tokens): # Read out a full python expression, stopping when we hit an @@ -66,16 +68,18 @@ def _read_python_expr(it, end_tokens): token_type = "PYTHON_EXPR" return Token(token_type, Origin.combine(origins), extra=expr_text) else: - raise PatsyError("unclosed bracket in embedded Python " - "expression", - Origin.combine(origins)) + raise PatsyError( + "unclosed bracket in embedded Python " "expression", Origin.combine(origins) + ) + def _tokenize_formula(code, operator_strings): assert "(" not in operator_strings assert ")" not in operator_strings - magic_token_types = {"(": Token.LPAREN, - ")": Token.RPAREN, - } + magic_token_types = { + "(": Token.LPAREN, + ")": Token.RPAREN, + } for operator_string in operator_strings: magic_token_types[operator_string] = operator_string # Once we enter a Python expression, a ( does not end it, but any other @@ -91,46 +95,48 @@ def _tokenize_formula(code, operator_strings): it.push_back((pytype, token_string, origin)) yield _read_python_expr(it, end_tokens) + def test__tokenize_formula(): code = "y ~ a + (foo(b,c + 2)) + -1 + 0 + 10" tokens = list(_tokenize_formula(code, ["+", "-", "~"])) - expecteds = [("PYTHON_EXPR", Origin(code, 0, 1), "y"), - ("~", Origin(code, 2, 3), None), - ("PYTHON_EXPR", Origin(code, 4, 5), "a"), - ("+", Origin(code, 6, 7), None), - (Token.LPAREN, Origin(code, 8, 9), None), - ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"), - (Token.RPAREN, Origin(code, 23, 24), None), - ("+", Origin(code, 25, 26), None), - ("-", Origin(code, 27, 28), None), - ("ONE", Origin(code, 28, 29), "1"), - ("+", Origin(code, 30, 31), None), - ("ZERO", Origin(code, 32, 33), "0"), - ("+", Origin(code, 34, 35), None), - ("NUMBER", Origin(code, 36, 38), "10"), - ] + expecteds = [ + ("PYTHON_EXPR", Origin(code, 0, 1), "y"), + ("~", Origin(code, 2, 3), None), + ("PYTHON_EXPR", Origin(code, 4, 5), "a"), + ("+", Origin(code, 6, 7), None), + (Token.LPAREN, Origin(code, 8, 9), None), + ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"), + (Token.RPAREN, Origin(code, 23, 24), None), + ("+", Origin(code, 25, 26), None), + ("-", Origin(code, 27, 28), None), + ("ONE", Origin(code, 28, 29), "1"), + ("+", Origin(code, 30, 31), None), + ("ZERO", Origin(code, 32, 33), "0"), + ("+", Origin(code, 34, 35), None), + ("NUMBER", Origin(code, 36, 38), "10"), + ] for got, expected in zip(tokens, expecteds): assert isinstance(got, Token) assert got.type == expected[0] assert got.origin == expected[1] assert got.extra == expected[2] + _unary_tilde = Operator("~", 1, -100) _default_ops = [ _unary_tilde, Operator("~", 2, -100), - Operator("+", 2, 100), Operator("-", 2, 100), Operator("*", 2, 200), Operator("/", 2, 200), Operator(":", 2, 300), Operator("**", 2, 500), - Operator("+", 1, 100), Operator("-", 1, 100), ] + def parse_formula(code, extra_operators=[]): if not code.strip(): code = "~ 1" @@ -141,35 +147,31 @@ def parse_formula(code, extra_operators=[]): operators = _default_ops + extra_operators operator_strings = [op.token_type for op in operators] - tree = infix_parse(_tokenize_formula(code, operator_strings), - operators, - _atomic_token_types) + tree = infix_parse( + _tokenize_formula(code, operator_strings), operators, _atomic_token_types + ) if not isinstance(tree, ParseNode) or tree.type != "~": tree = ParseNode("~", None, [tree], 
tree.origin) return tree + ############# _parser_tests = { "": ["~", "1"], " ": ["~", "1"], " \n ": ["~", "1"], - "1": ["~", "1"], "a": ["~", "a"], "a ~ b": ["~", "a", "b"], - "(a ~ b)": ["~", "a", "b"], "a ~ ((((b))))": ["~", "a", "b"], "a ~ ((((+b))))": ["~", "a", ["+", "b"]], - "a + b + c": ["~", ["+", ["+", "a", "b"], "c"]], "a + (b ~ c) + d": ["~", ["+", ["+", "a", ["~", "b", "c"]], "d"]], - "a + np.log(a, base=10)": ["~", ["+", "a", "np.log(a, base=10)"]], # Note different spacing: "a + np . log(a , base = 10)": ["~", ["+", "a", "np.log(a, base=10)"]], - # Check precedence "a + b ~ c * d": ["~", ["+", "a", "b"], ["*", "c", "d"]], "a + b * c": ["~", ["+", "a", ["*", "b", "c"]]], @@ -178,12 +180,11 @@ def parse_formula(code, extra_operators=[]): "a + b:c": ["~", ["+", "a", [":", "b", "c"]]], "(a + b):c": ["~", [":", ["+", "a", "b"], "c"]], "a*b:c": ["~", ["*", "a", [":", "b", "c"]]], - "a+b / c": ["~", ["+", "a", ["/", "b", "c"]]], "~ a": ["~", "a"], - "-1": ["~", ["-", "1"]], - } +} + def _compare_trees(got, expected): assert isinstance(got, ParseNode) @@ -195,6 +196,7 @@ def _compare_trees(got, expected): assert got.type in _atomic_token_types assert got.token.extra == expected + def _do_parse_test(test_cases, extra_operators): for code, expected in test_cases.items(): actual = parse_formula(code, extra_operators=extra_operators) @@ -202,9 +204,11 @@ def _do_parse_test(test_cases, extra_operators): print(actual) _compare_trees(actual, expected) + def test_parse_formula(): _do_parse_test(_parser_tests, []) + def test_parse_origin(): tree = parse_formula("a ~ b + c") assert tree.origin == Origin("a ~ b + c", 0, 9) @@ -215,43 +219,36 @@ def test_parse_origin(): assert tree.args[1].args[0].origin == Origin("a ~ b + c", 4, 5) assert tree.args[1].args[1].origin == Origin("a ~ b + c", 8, 9) + # <> mark off where the error should be reported: _parser_error_tests = [ "a <+>", "a + <(>", - "a + b <# asdf>", - "<)>", "a + <)>", "<*> a", "a + <*>", - "a + <foo[bar>", "a + <foo{bar>", "a + <foo(bar>", - "a + <[bar>", "a + <{bar>", - "a + <{bar[]>", - "a + foo<]>bar", "a + foo[]<]>bar", "a + foo{}<}>bar", "a + foo<)>bar", - "a + b<)>", "(a) <.>", - "<(>a + b", - - "a +< >'foo", # Not the best placement for the error + "a +< >'foo", # Not the best placement for the error ] + # Split out so it can also be used by tests of the evaluator (which also # raises PatsyError's) -def _parsing_error_test(parse_fn, error_descs): # pragma: no cover +def _parsing_error_test(parse_fn, error_descs): # pragma: no cover for error_desc in error_descs: letters = [] start = None @@ -277,20 +274,22 @@ def _parsing_error_test(parse_fn, error_descs): # pragma: no cover else: assert False, "parser failed to report an error!"
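The <> markers in _parser_error_tests above encode where in each formula the reported error span should fall. A hypothetical helper (not part of patsy) that mirrors the marker-stripping loop inside _parsing_error_test:

def decode_error_marker(marked):
    # Strip the < and > markers, recording the positions they bracket;
    # returns (code, expected_error_start, expected_error_end).
    letters = []
    start = end = None
    for letter in marked:
        if letter == "<":
            start = len(letters)
        elif letter == ">":
            end = len(letters)
        else:
            letters.append(letter)
    return "".join(letters), start, end

assert decode_error_marker("a <+>") == ("a +", 2, 3)
assert decode_error_marker("<(>a + b") == ("(a + b", 0, 1)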
+ def test_parse_errors(extra_operators=[]): def parse_fn(code): return parse_formula(code, extra_operators=extra_operators) + _parsing_error_test(parse_fn, _parser_error_tests) + _extra_op_parser_tests = { "a | b": ["~", ["|", "a", "b"]], "a * b|c": ["~", ["*", "a", ["|", "b", "c"]]], - } +} + def test_parse_extra_op(): extra_operators = [Operator("|", 2, 250)] - _do_parse_test(_parser_tests, - extra_operators=extra_operators) - _do_parse_test(_extra_op_parser_tests, - extra_operators=extra_operators) + _do_parse_test(_parser_tests, extra_operators=extra_operators) + _do_parse_test(_extra_op_parser_tests, extra_operators=extra_operators) test_parse_errors(extra_operators=extra_operators) diff --git a/patsy/redundancy.py b/patsy/redundancy.py index c428bdf..c81d439 100644 --- a/patsy/redundancy.py +++ b/patsy/redundancy.py @@ -42,6 +42,7 @@ from patsy.util import no_pickling + # This should really be a named tuple, but those don't exist until Python # 2.6... class _ExpandedFactor(object): @@ -49,6 +50,7 @@ class _ExpandedFactor(object): full-rank (includes_intercept=True) or not. These objects are treated as immutable.""" + def __init__(self, includes_intercept, factor): self.includes_intercept = includes_intercept self.factor = factor @@ -57,9 +59,11 @@ def __hash__(self): return hash((_ExpandedFactor, self.includes_intercept, self.factor)) def __eq__(self, other): - return (isinstance(other, _ExpandedFactor) - and other.includes_intercept == self.includes_intercept - and other.factor == self.factor) + return ( + isinstance(other, _ExpandedFactor) + and other.includes_intercept == self.includes_intercept + and other.factor == self.factor + ) def __ne__(self, other): return not self == other @@ -73,15 +77,18 @@ def __repr__(self): __getstate__ = no_pickling + class _Subterm(object): "Also immutable." + def __init__(self, efactors): self.efactors = frozenset(efactors) def can_absorb(self, other): # returns True if 'self' is like a-:b-, and 'other' is like a- - return (len(self.efactors) - len(other.efactors) == 1 - and self.efactors.issuperset(other.efactors)) + return len(self.efactors) - len( + other.efactors + ) == 1 and self.efactors.issuperset(other.efactors) def absorb(self, other): diff = self.efactors.difference(other.efactors) @@ -96,8 +103,7 @@ def __hash__(self): return hash((_Subterm, self.efactors)) def __eq__(self, other): - return (isinstance(other, _Subterm) - and self.efactors == self.efactors) + return isinstance(other, _Subterm) and self.efactors == self.efactors def __ne__(self, other): return not self == other @@ -107,6 +113,7 @@ def __repr__(self): __getstate__ = no_pickling + # For testing: takes a shorthand description of a list of subterms like # [(), ("a-",), ("a-", "b+")] # and expands it into a list of _Subterm and _ExpandedFactor objects. @@ -116,11 +123,11 @@ def _expand_test_abbrevs(short_subterms): factors = [] for factor_name in subterm: assert factor_name[-1] in ("+", "-") - factors.append(_ExpandedFactor(factor_name[-1] == "+", - factor_name[:-1])) + factors.append(_ExpandedFactor(factor_name[-1] == "+", factor_name[:-1])) subterms.append(_Subterm(factors)) return subterms + def test__Subterm(): s_ab = _expand_test_abbrevs([["a-", "b-"]])[0] s_abc = _expand_test_abbrevs([["a-", "b-", "c-"]])[0] @@ -134,6 +141,7 @@ def test__Subterm(): assert s_ab.can_absorb(s_a) assert s_ab.absorb(s_a) == s_abp + # Importantly, this preserves the order of the input. 
Both the items inside # each subset are in the order they were in the original tuple, and the tuples # are emitted so that they're sorted with respect to their elements position @@ -147,6 +155,7 @@ def helper(seq): for subset in _subsets_sorted(seq[1:]): yield subset yield (obj,) + subset + # Transform each obj -> (idx, obj) tuple, so that we can later sort them # by their position in the original list. expanded = list(enumerate(tupl)) @@ -159,29 +168,41 @@ def helper(seq): # And finally, we strip off the idx's: for subset in expanded_subsets: yield tuple([obj for (idx, obj) in subset]) - + + def test__subsets_sorted(): assert list(_subsets_sorted((1, 2))) == [(), (1,), (2,), (1, 2)] - assert (list(_subsets_sorted((1, 2, 3))) - == [(), (1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)]) - assert len(list(_subsets_sorted(range(5)))) == 2 ** 5 + assert list(_subsets_sorted((1, 2, 3))) == [ + (), + (1,), + (2,), + (3,), + (1, 2), + (1, 3), + (2, 3), + (1, 2, 3), + ] + assert len(list(_subsets_sorted(range(5)))) == 2**5 + def _simplify_one_subterm(subterms): # We simplify greedily from left to right. # Returns True if succeeded, False otherwise for short_i, short_subterm in enumerate(subterms): - for long_i, long_subterm in enumerate(subterms[short_i + 1:]): + for long_i, long_subterm in enumerate(subterms[short_i + 1 :]): if long_subterm.can_absorb(short_subterm): new_subterm = long_subterm.absorb(short_subterm) subterms[short_i + 1 + long_i] = new_subterm subterms.pop(short_i) return True return False - + + def _simplify_subterms(subterms): while _simplify_one_subterm(subterms): pass + def test__simplify_subterms(): def t(given, expected): given = _expand_test_abbrevs(given) @@ -189,12 +210,14 @@ def t(given, expected): print("testing if:", given, "->", expected) _simplify_subterms(given) assert given == expected + t([("a-",)], [("a-",)]) t([(), ("a-",)], [("a+",)]) t([(), ("a-",), ("b-",), ("a-", "b-")], [("a+", "b+")]) t([(), ("a-",), ("a-", "b-")], [("a+",), ("a-", "b-")]) t([("a-",), ("b-",), ("a-", "b-")], [("b-",), ("a-", "b+")]) + # 'term' is a Term # 'numeric_factors' is any set-like object which lists the # numeric/non-categorical factors in this term. Such factors are just @@ -235,8 +258,10 @@ def pick_contrasts_for_term(term, numeric_factors, used_subterms): factor_codings.append(factor_coding) return factor_codings + def test_pick_contrasts_for_term(): from patsy.desc import Term + used = set() codings = pick_contrasts_for_term(Term([]), set(), used) assert codings == [{}] diff --git a/patsy/splines.py b/patsy/splines.py index 2644900..6504b98 100644 --- a/patsy/splines.py +++ b/patsy/splines.py @@ -15,10 +15,11 @@ if have_pandas: import pandas + def _eval_bspline_basis(x, knots, degree): try: from scipy.interpolate import splev - except ImportError: # pragma: no cover + except ImportError: # pragma: no cover raise ImportError("spline functionality requires scipy") # 'knots' are assumed to be already pre-processed. E.g. usually you # want to include duplicate copies of boundary knots; you should do @@ -36,9 +37,11 @@ def _eval_bspline_basis(x, knots, degree): # this and decide what to do with it, I'm going to play it safe and # disallow such points. if np.min(x) < np.min(knots) or np.max(x) > np.max(knots): - raise NotImplementedError("some data points fall outside the " - "outermost knots, and I'm not sure how " - "to handle them. 
(Patches accepted!)") + raise NotImplementedError( + "some data points fall outside the " + "outermost knots, and I'm not sure how " + "to handle them. (Patches accepted!)" + ) # Thanks to Charles Harris for explaining splev. It's not well # documented, but basically it computes an arbitrary b-spline basis # given knots and degree on some specified points (or derivatives @@ -59,21 +62,26 @@ def _eval_bspline_basis(x, knots, degree): basis[:, i] = splev(x, (knots, coefs, degree)) return basis + def _R_compat_quantile(x, probs): - #return np.percentile(x, 100 * np.asarray(probs)) + # return np.percentile(x, 100 * np.asarray(probs)) probs = np.asarray(probs) - quantiles = np.asarray([np.percentile(x, 100 * prob) - for prob in probs.ravel(order="C")]) + quantiles = np.asarray( + [np.percentile(x, 100 * prob) for prob in probs.ravel(order="C")] + ) return quantiles.reshape(probs.shape, order="C") + def test__R_compat_quantile(): def t(x, prob, expected): assert np.allclose(_R_compat_quantile(x, prob), expected) + t([10, 20], 0.5, 15) t([10, 20], 0.3, 13) t([10, 20], [0.3, 0.7], [13, 17]) t(list(range(10)), [0.3, 0.7], [2.7, 6.3]) + class BS(object): """bs(x, df=None, knots=None, degree=3, include_intercept=False, lower_bound=None, upper_bound=None) @@ -133,29 +141,37 @@ class BS(object): .. versionadded:: 0.2.0 """ + def __init__(self): self._tmp = {} self._degree = None self._all_knots = None - def memorize_chunk(self, x, df=None, knots=None, degree=3, - include_intercept=False, - lower_bound=None, upper_bound=None): - args = {"df": df, - "knots": knots, - "degree": degree, - "include_intercept": include_intercept, - "lower_bound": lower_bound, - "upper_bound": upper_bound, - } + def memorize_chunk( + self, + x, + df=None, + knots=None, + degree=3, + include_intercept=False, + lower_bound=None, + upper_bound=None, + ): + args = { + "df": df, + "knots": knots, + "degree": degree, + "include_intercept": include_intercept, + "lower_bound": lower_bound, + "upper_bound": upper_bound, + } self._tmp["args"] = args # XX: check whether we need x values before saving them x = np.atleast_1d(x) if x.ndim == 2 and x.shape[1] == 1: x = x[:, 0] if x.ndim > 1: - raise ValueError("input to 'bs' must be 1-d, " - "or a 2-d column vector") + raise ValueError("input to 'bs' must be 1-d, " "or a 2-d column vector") # There's no better way to compute exact quantiles than memorizing # all data. self._tmp.setdefault("xs", []).append(x) @@ -166,11 +182,11 @@ def memorize_finish(self): del self._tmp if args["degree"] < 0: - raise ValueError("degree must be greater than 0 (not %r)" - % (args["degree"],)) + raise ValueError( + "degree must be greater than 0 (not %r)" % (args["degree"],) + ) if int(args["degree"]) != args["degree"]: - raise ValueError("degree must be an integer (not %r)" - % (self._degree,)) + raise ValueError("degree must be an integer (not %r)" % (self._degree,)) # These are guaranteed to all be 1d vectors by the code above x = np.concatenate(tmp["xs"]) @@ -182,20 +198,31 @@ def memorize_finish(self): if not args["include_intercept"]: n_inner_knots += 1 if n_inner_knots < 0: - raise ValueError("df=%r is too small for degree=%r and " - "include_intercept=%r; must be >= %s" - % (args["df"], args["degree"], - args["include_intercept"], - # We know that n_inner_knots is negative; - # if df were that much larger, it would - # have been zero, and things would work. 
- args["df"] - n_inner_knots)) + raise ValueError( + "df=%r is too small for degree=%r and " + "include_intercept=%r; must be >= %s" + % ( + args["df"], + args["degree"], + args["include_intercept"], + # We know that n_inner_knots is negative; + # if df were that much larger, it would + # have been zero, and things would work. + args["df"] - n_inner_knots, + ) + ) if args["knots"] is not None: if len(args["knots"]) != n_inner_knots: - raise ValueError("df=%s with degree=%r implies %s knots, " - "but %s knots were provided" - % (args["df"], args["degree"], - n_inner_knots, len(args["knots"]))) + raise ValueError( + "df=%s with degree=%r implies %s knots, " + "but %s knots were provided" + % ( + args["df"], + args["degree"], + n_inner_knots, + len(args["knots"]), + ) + ) else: # Need to compute inner knots knot_quantiles = np.linspace(0, 1, n_inner_knots + 2)[1:-1] @@ -211,31 +238,38 @@ def memorize_finish(self): else: upper_bound = np.max(x) if lower_bound > upper_bound: - raise ValueError("lower_bound > upper_bound (%r > %r)" - % (lower_bound, upper_bound)) + raise ValueError( + "lower_bound > upper_bound (%r > %r)" % (lower_bound, upper_bound) + ) inner_knots = np.asarray(inner_knots) if inner_knots.ndim > 1: raise ValueError("knots must be 1 dimensional") if np.any(inner_knots < lower_bound): - raise ValueError("some knot values (%s) fall below lower bound " - "(%r)" - % (inner_knots[inner_knots < lower_bound], - lower_bound)) + raise ValueError( + "some knot values (%s) fall below lower bound " + "(%r)" % (inner_knots[inner_knots < lower_bound], lower_bound) + ) if np.any(inner_knots > upper_bound): - raise ValueError("some knot values (%s) fall above upper bound " - "(%r)" - % (inner_knots[inner_knots > upper_bound], - upper_bound)) - all_knots = np.concatenate(([lower_bound, upper_bound] * order, - inner_knots)) + raise ValueError( + "some knot values (%s) fall above upper bound " + "(%r)" % (inner_knots[inner_knots > upper_bound], upper_bound) + ) + all_knots = np.concatenate(([lower_bound, upper_bound] * order, inner_knots)) all_knots.sort() self._degree = args["degree"] self._all_knots = all_knots - def transform(self, x, df=None, knots=None, degree=3, - include_intercept=False, - lower_bound=None, upper_bound=None): + def transform( + self, + x, + df=None, + knots=None, + degree=3, + include_intercept=False, + lower_bound=None, + upper_bound=None, + ): basis = _eval_bspline_basis(x, self._all_knots, self._degree) if not include_intercept: basis = basis[:, 1:] @@ -247,13 +281,14 @@ def transform(self, x, df=None, knots=None, degree=3, __getstate__ = no_pickling + bs = stateful_transform(BS) + def test_bs_compat(): from patsy.test_state import check_stateful - from patsy.test_splines_bs_data import (R_bs_test_x, - R_bs_test_data, - R_bs_num_tests) + from patsy.test_splines_bs_data import R_bs_test_x, R_bs_test_data, R_bs_num_tests + lines = R_bs_test_data.split("\n") tests_ran = 0 start_idx = lines.index("--BEGIN TEST CASE--") @@ -274,12 +309,12 @@ def test_bs_compat(): "df": eval(test_data["df"]), # np.array() call, or None "knots": eval(test_data["knots"]), - } + } if test_data["Boundary.knots"] != "None": lower, upper = eval(test_data["Boundary.knots"]) kwargs["lower_bound"] = lower kwargs["upper_bound"] = upper - kwargs["include_intercept"] = (test_data["intercept"] == "TRUE") + kwargs["include_intercept"] = test_data["intercept"] == "TRUE" # Special case: in R, setting intercept=TRUE increases the effective # dof by 1. Adjust our arguments to match. 
# if kwargs["df"] is not None and kwargs["include_intercept"]: @@ -294,8 +329,10 @@ def test_bs_compat(): start_idx = stop_idx + 1 assert tests_ran == R_bs_num_tests + test_bs_compat.slow = 1 + # This isn't checked by the above, because R doesn't have zero degree # b-splines. def test_bs_0degree(): @@ -315,18 +352,19 @@ def test_bs_0degree(): # get included into the larger region, not the smaller. This is consistent # with Python's half-open interval convention -- each basis function is # constant on [knot[i], knot[i + 1]). - assert np.array_equal(bs([0, 1, 2], degree=0, knots=[1], - include_intercept=True), - [[1, 0], - [0, 1], - [0, 1]]) + assert np.array_equal( + bs([0, 1, 2], degree=0, knots=[1], include_intercept=True), + [[1, 0], [0, 1], [0, 1]], + ) result_int = bs(x, knots=[1, 4], degree=0, include_intercept=True) result_no_int = bs(x, knots=[1, 4], degree=0, include_intercept=False) assert np.array_equal(result_int[:, 1:], result_no_int) + def test_bs_errors(): import pytest + x = np.linspace(-10, 10, 20) # error checks: # out of bounds @@ -341,59 +379,43 @@ def test_bs_errors(): bs(x, df=10, include_intercept=False, knots=[0] * 9, degree=1) bs(x, df=10, include_intercept=True, knots=[0] * 8, degree=1) # too many knots: - pytest.raises(ValueError, - bs, x, df=10, include_intercept=False, knots=[0] * 8) - pytest.raises(ValueError, - bs, x, df=10, include_intercept=True, knots=[0] * 7) - pytest.raises(ValueError, - bs, x, df=10, include_intercept=False, knots=[0] * 10, - degree=1) - pytest.raises(ValueError, - bs, x, df=10, include_intercept=True, knots=[0] * 9, - degree=1) + pytest.raises(ValueError, bs, x, df=10, include_intercept=False, knots=[0] * 8) + pytest.raises(ValueError, bs, x, df=10, include_intercept=True, knots=[0] * 7) + pytest.raises( + ValueError, bs, x, df=10, include_intercept=False, knots=[0] * 10, degree=1 + ) + pytest.raises( + ValueError, bs, x, df=10, include_intercept=True, knots=[0] * 9, degree=1 + ) # too few knots: - pytest.raises(ValueError, - bs, x, df=10, include_intercept=False, knots=[0] * 6) - pytest.raises(ValueError, - bs, x, df=10, include_intercept=True, knots=[0] * 5) - pytest.raises(ValueError, - bs, x, df=10, include_intercept=False, knots=[0] * 8, - degree=1) - pytest.raises(ValueError, - bs, x, df=10, include_intercept=True, knots=[0] * 7, - degree=1) + pytest.raises(ValueError, bs, x, df=10, include_intercept=False, knots=[0] * 6) + pytest.raises(ValueError, bs, x, df=10, include_intercept=True, knots=[0] * 5) + pytest.raises( + ValueError, bs, x, df=10, include_intercept=False, knots=[0] * 8, degree=1 + ) + pytest.raises( + ValueError, bs, x, df=10, include_intercept=True, knots=[0] * 7, degree=1 + ) # df too small - pytest.raises(ValueError, - bs, x, df=1, degree=3) - pytest.raises(ValueError, - bs, x, df=3, degree=5) + pytest.raises(ValueError, bs, x, df=1, degree=3) + pytest.raises(ValueError, bs, x, df=3, degree=5) # bad degree - pytest.raises(ValueError, - bs, x, df=10, degree=-1) - pytest.raises(ValueError, - bs, x, df=10, degree=1.5) + pytest.raises(ValueError, bs, x, df=10, degree=-1) + pytest.raises(ValueError, bs, x, df=10, degree=1.5) # upper_bound < lower_bound - pytest.raises(ValueError, - bs, x, 3, lower_bound=1, upper_bound=-1) + pytest.raises(ValueError, bs, x, 3, lower_bound=1, upper_bound=-1) # multidimensional input - pytest.raises(ValueError, - bs, np.column_stack((x, x)), 3) + pytest.raises(ValueError, bs, np.column_stack((x, x)), 3) # unsorted knots are okay, and get sorted assert np.array_equal(bs(x, knots=[1, 4]), 
bs(x, knots=[4, 1])) # 2d knots - pytest.raises(ValueError, - bs, x, knots=[[0], [20]]) + pytest.raises(ValueError, bs, x, knots=[[0], [20]]) # knots > upper_bound - pytest.raises(ValueError, - bs, x, knots=[0, 20]) - pytest.raises(ValueError, - bs, x, knots=[0, 4], upper_bound=3) + pytest.raises(ValueError, bs, x, knots=[0, 20]) + pytest.raises(ValueError, bs, x, knots=[0, 4], upper_bound=3) # knots < lower_bound - pytest.raises(ValueError, - bs, x, knots=[-20, 0]) - pytest.raises(ValueError, - bs, x, knots=[-4, 0], lower_bound=-3) - + pytest.raises(ValueError, bs, x, knots=[-20, 0]) + pytest.raises(ValueError, bs, x, knots=[-4, 0], lower_bound=-3) # differences between bs and ns (since the R code is a pile of copy-paste): diff --git a/patsy/state.py b/patsy/state.py index 933c588..8d674ba 100644 --- a/patsy/state.py +++ b/patsy/state.py @@ -26,29 +26,41 @@ from functools import wraps import numpy as np -from patsy.util import (atleast_2d_column_default, - asarray_or_pandas, pandas_friendly_reshape, - wide_dtype_for, safe_issubdtype, - no_pickling, assert_no_pickling) +from patsy.util import ( + atleast_2d_column_default, + asarray_or_pandas, + pandas_friendly_reshape, + wide_dtype_for, + safe_issubdtype, + no_pickling, + assert_no_pickling, +) # These are made available in the patsy.* namespace -__all__ = ["stateful_transform", - "center", "standardize", "scale", - ] +__all__ = [ + "stateful_transform", + "center", + "standardize", + "scale", +] + def stateful_transform(class_): """Create a stateful transform callable object from a class that fulfills the :ref:`stateful transform protocol <stateful-transform-protocol>`. """ + @wraps(class_) def stateful_transform_wrapper(*args, **kwargs): transform = class_() transform.memorize_chunk(*args, **kwargs) transform.memorize_finish() return transform.transform(*args, **kwargs) + stateful_transform_wrapper.__patsy_stateful_transform__ = class_ return stateful_transform_wrapper + # class NonIncrementalStatefulTransform(object): # def __init__(self): # self._data = [] @@ -76,6 +88,7 @@ def stateful_transform_wrapper(*args, **kwargs): # class QuantileEstimatingTransform(NonIncrementalStatefulTransform): # def memorize_all(self, input_data, *args, **kwargs): + class Center(object): """center(x) A stateful transform that centers input data, i.e., subtracts the mean. If input has multiple columns, centers each column separately. Equivalent to ``standardize(x, rescale=False)`` """ + def __init__(self): self._sum = None self._count = 0 @@ -118,8 +132,10 @@ def transform(self, x): __getstate__ = no_pickling + center = stateful_transform(Center) + # See: # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm # or page 232 of Knuth vol. 3 (3rd ed.). @@ -141,6 +157,7 @@ class Standardize(object): memory-efficient online algorithm, making it suitable for use with large incrementally processed data-sets.
""" + def __init__(self): self.current_n = 0 self.current_mean = None @@ -176,6 +193,7 @@ def transform(self, x, center=True, rescale=True, ddof=0): __getstate__ = no_pickling + standardize = stateful_transform(Standardize) # R compatibility: scale = standardize diff --git a/patsy/test_build.py b/patsy/test_build.py index 4b112ef..bad3be6 100644 --- a/patsy/test_build.py +++ b/patsy/test_build.py @@ -10,8 +10,7 @@ import numpy as np import pytest from patsy import PatsyError -from patsy.util import (atleast_2d_column_default, - have_pandas, have_pandas_categorical) +from patsy.util import atleast_2d_column_default, have_pandas, have_pandas_categorical from patsy.desc import Term, INTERCEPT from patsy.build import build_design_matrices, design_matrix_builders from patsy.categorical import C @@ -21,6 +20,7 @@ if have_pandas: import pandas + def assert_full_rank(m): m = atleast_2d_column_default(m) if m.shape[1] == 0: @@ -29,18 +29,16 @@ def assert_full_rank(m): rank = np.sum(s > 1e-10) assert rank == m.shape[1] + def test_assert_full_rank(): assert_full_rank(np.eye(10)) assert_full_rank([[1, 0], [1, 0], [1, 0], [1, 1]]) - pytest.raises(AssertionError, - assert_full_rank, [[1, 0], [2, 0]]) - pytest.raises(AssertionError, - assert_full_rank, [[1, 2], [2, 4]]) - pytest.raises(AssertionError, - assert_full_rank, [[1, 2, 3], [1, 10, 100]]) + pytest.raises(AssertionError, assert_full_rank, [[1, 0], [2, 0]]) + pytest.raises(AssertionError, assert_full_rank, [[1, 2], [2, 4]]) + pytest.raises(AssertionError, assert_full_rank, [[1, 2, 3], [1, 10, 100]]) # col1 + col2 = col3 - pytest.raises(AssertionError, - assert_full_rank, [[1, 2, 3], [1, 5, 6], [1, 6, 7]]) + pytest.raises(AssertionError, assert_full_rank, [[1, 2, 3], [1, 5, 6], [1, 6, 7]]) + def make_termlist(*entries): terms = [] @@ -48,6 +46,7 @@ def make_termlist(*entries): terms.append(Term([LookupFactor(name) for name in entry])) return terms + def check_design_matrix(mm, expected_rank, termlist, column_names=None): assert_full_rank(mm) assert set(mm.design_info.terms) == set(termlist) @@ -56,22 +55,23 @@ def check_design_matrix(mm, expected_rank, termlist, column_names=None): assert mm.ndim == 2 assert mm.shape[1] == expected_rank + def make_matrix(data, expected_rank, entries, column_names=None): termlist = make_termlist(*entries) + def iter_maker(): yield data + design_infos = design_matrix_builders([termlist], iter_maker, eval_env=0) matrices = build_design_matrices(design_infos, data) matrix = matrices[0] - assert (design_infos[0].term_slices - == matrix.design_info.term_slices) - assert (design_infos[0].column_names - == matrix.design_info.column_names) + assert design_infos[0].term_slices == matrix.design_info.term_slices + assert design_infos[0].column_names == matrix.design_info.column_names assert matrix.design_info is design_infos[0] - check_design_matrix(matrix, expected_rank, termlist, - column_names=column_names) + check_design_matrix(matrix, expected_rank, termlist, column_names=column_names) return matrix + def test_simple(): data = balanced(a=2, b=2) x1 = data["x1"] = np.linspace(0, 1, len(data["a"])) @@ -83,41 +83,52 @@ def test_simple(): m = make_matrix(data, 2, [[], ["a"]], column_names=["Intercept", "a[T.a2]"]) assert np.allclose(m, [[1, 0], [1, 0], [1, 1], [1, 1]]) - m = make_matrix(data, 4, [["a", "b"]], - column_names=["a[a1]:b[b1]", "a[a2]:b[b1]", - "a[a1]:b[b2]", "a[a2]:b[b2]"]) - assert np.allclose(m, [[1, 0, 0, 0], - [0, 0, 1, 0], - [0, 1, 0, 0], - [0, 0, 0, 1]]) - - m = make_matrix(data, 4, [[], ["a"], ["b"], ["a", 
"b"]], - column_names=["Intercept", "a[T.a2]", - "b[T.b2]", "a[T.a2]:b[T.b2]"]) - assert np.allclose(m, [[1, 0, 0, 0], - [1, 0, 1, 0], - [1, 1, 0, 0], - [1, 1, 1, 1]]) - - m = make_matrix(data, 4, [[], ["b"], ["a"], ["b", "a"]], - column_names=["Intercept", "b[T.b2]", - "a[T.a2]", "b[T.b2]:a[T.a2]"]) - assert np.allclose(m, [[1, 0, 0, 0], - [1, 1, 0, 0], - [1, 0, 1, 0], - [1, 1, 1, 1]]) - - m = make_matrix(data, 4, [["a"], ["x1"], ["a", "x1"]], - column_names=["a[a1]", "a[a2]", "x1", "a[T.a2]:x1"]) - assert np.allclose(m, [[1, 0, x1[0], 0], - [1, 0, x1[1], 0], - [0, 1, x1[2], x1[2]], - [0, 1, x1[3], x1[3]]]) - - m = make_matrix(data, 3, [["x1"], ["x2"], ["x2", "x1"]], - column_names=["x1", "x2", "x2:x1"]) + m = make_matrix( + data, + 4, + [["a", "b"]], + column_names=["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"], + ) + assert np.allclose(m, [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]) + + m = make_matrix( + data, + 4, + [[], ["a"], ["b"], ["a", "b"]], + column_names=["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"], + ) + assert np.allclose(m, [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]]) + + m = make_matrix( + data, + 4, + [[], ["b"], ["a"], ["b", "a"]], + column_names=["Intercept", "b[T.b2]", "a[T.a2]", "b[T.b2]:a[T.a2]"], + ) + assert np.allclose(m, [[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]]) + + m = make_matrix( + data, + 4, + [["a"], ["x1"], ["a", "x1"]], + column_names=["a[a1]", "a[a2]", "x1", "a[T.a2]:x1"], + ) + assert np.allclose( + m, + [ + [1, 0, x1[0], 0], + [1, 0, x1[1], 0], + [0, 1, x1[2], x1[2]], + [0, 1, x1[3], x1[3]], + ], + ) + + m = make_matrix( + data, 3, [["x1"], ["x2"], ["x2", "x1"]], column_names=["x1", "x2", "x2:x1"] + ) assert np.allclose(m, np.column_stack((x1, x2, x1 * x2))) + def test_R_bugs(): data = balanced(a=2, b=2, c=2) data["x"] = np.linspace(0, 1, len(data["a"])) @@ -136,6 +147,7 @@ def test_R_bugs(): # does get this one right, but we might as well test it.) 
make_matrix(data, 6, [["a", "c"], ["a", "b"]]) + def test_redundancy_thoroughly(): # To make sure there aren't any lurking bugs analogous to the ones that R # has (see above), we check that we get the correct matrix rank for every @@ -157,13 +169,16 @@ def all_subsets(l): all_termlist_templates = list(all_subsets(all_terms)) print(len(all_termlist_templates)) # eliminate some of the symmetric versions to speed things up - redundant = [[("b",), ("a",)], - [("x2",), ("x1",)], - [("b", "x2"), ("a", "x1")], - [("a", "b", "x2"), ("a", "b", "x1")], - [("b", "x1", "x2"), ("a", "x1", "x2")]] + redundant = [ + [("b",), ("a",)], + [("x2",), ("x1",)], + [("b", "x2"), ("a", "x1")], + [("a", "b", "x2"), ("a", "b", "x1")], + [("b", "x1", "x2"), ("a", "x1", "x2")], + ] count = 0 import time + start = time.time() for termlist_template in all_termlist_templates: termlist_set = set(termlist_template) @@ -182,9 +197,9 @@ def all_subsets(l): expected_rank = len(expanded_terms) if termlist_template in [(), ((),)]: # No data dependence, should fail - pytest.raises(PatsyError, - make_matrix, - data, expected_rank, termlist_template) + pytest.raises( + PatsyError, make_matrix, data, expected_rank, termlist_template + ) else: make_matrix(data, expected_rank, termlist_template) count += 1 @@ -192,47 +207,56 @@ def all_subsets(l): print("Completed:", count) print("Took %0.2f seconds" % (time.time() - start,)) + test_redundancy_thoroughly.slow = 1 + def test_data_types(): - basic_dict = {"a": ["a1", "a2", "a1", "a2"], - "x": [1, 2, 3, 4]} + basic_dict = {"a": ["a1", "a2", "a1", "a2"], "x": [1, 2, 3, 4]} # On Python 2, this is identical to basic_dict: basic_dict_bytes = dict(basic_dict) basic_dict_bytes["a"] = [s.encode("ascii") for s in basic_dict_bytes["a"]] # On Python 3, this is identical to basic_dict: - basic_dict_unicode = {"a": ["a1", "a2", "a1", "a2"], - "x": [1, 2, 3, 4]} + basic_dict_unicode = {"a": ["a1", "a2", "a1", "a2"], "x": [1, 2, 3, 4]} basic_dict_unicode = dict(basic_dict) basic_dict_unicode["a"] = [str(s) for s in basic_dict_unicode["a"]] - structured_array_bytes = np.array(list(zip(basic_dict["a"], - basic_dict["x"])), - dtype=[("a", "S2"), ("x", int)]) - structured_array_unicode = np.array(list(zip(basic_dict["a"], - basic_dict["x"])), - dtype=[("a", "U2"), ("x", int)]) + structured_array_bytes = np.array( + list(zip(basic_dict["a"], basic_dict["x"])), dtype=[("a", "S2"), ("x", int)] + ) + structured_array_unicode = np.array( + list(zip(basic_dict["a"], basic_dict["x"])), dtype=[("a", "U2"), ("x", int)] + ) recarray_bytes = structured_array_bytes.view(np.recarray) recarray_unicode = structured_array_unicode.view(np.recarray) - datas = [basic_dict, structured_array_bytes, structured_array_unicode, - recarray_bytes, recarray_unicode] + datas = [ + basic_dict, + structured_array_bytes, + structured_array_unicode, + recarray_bytes, + recarray_unicode, + ] if have_pandas: df_bytes = pandas.DataFrame(basic_dict_bytes) datas.append(df_bytes) df_unicode = pandas.DataFrame(basic_dict_unicode) datas.append(df_unicode) for data in datas: - m = make_matrix(data, 4, [["a"], ["a", "x"]], - column_names=["a[a1]", "a[a2]", "a[a1]:x", "a[a2]:x"]) - assert np.allclose(m, [[1, 0, 1, 0], - [0, 1, 0, 2], - [1, 0, 3, 0], - [0, 1, 0, 4]]) + m = make_matrix( + data, + 4, + [["a"], ["a", "x"]], + column_names=["a[a1]", "a[a2]", "a[a1]:x", "a[a2]:x"], + ) + assert np.allclose(m, [[1, 0, 1, 0], [0, 1, 0, 2], [1, 0, 3, 0], [0, 1, 0, 4]]) + def test_build_design_matrices_dtype(): data = {"x": [1, 2, 3]} + def iter_maker(): 
yield data + builder = design_matrix_builders([make_termlist("x")], iter_maker, 0)[0] mat = build_design_matrices([builder], data)[0] @@ -245,10 +269,13 @@ def iter_maker(): mat = build_design_matrices([builder], data, dtype=np.float128)[0] assert mat.dtype == np.dtype(np.float128) + def test_return_type(): data = {"x": [1, 2, 3]} + def iter_maker(): yield data + builder = design_matrix_builders([make_termlist("x")], iter_maker, 0)[0] # Check explicitly passing return_type="matrix" works @@ -256,101 +283,114 @@ def iter_maker(): assert isinstance(mat, DesignMatrix) # Check that nonsense is detected - pytest.raises(PatsyError, - build_design_matrices, [builder], data, - return_type="asdfsadf") + pytest.raises( + PatsyError, build_design_matrices, [builder], data, return_type="asdfsadf" + ) + def test_NA_action(): initial_data = {"x": [1, 2, 3], "c": ["c1", "c2", "c1"]} + def iter_maker(): yield initial_data + builder = design_matrix_builders([make_termlist("x", "c")], iter_maker, 0)[0] # By default drops rows containing either NaN or None - mat = build_design_matrices([builder], - {"x": [10.0, np.nan, 20.0], - "c": np.asarray(["c1", "c2", None], - dtype=object)})[0] + mat = build_design_matrices( + [builder], + {"x": [10.0, np.nan, 20.0], "c": np.asarray(["c1", "c2", None], dtype=object)}, + )[0] assert mat.shape == (1, 3) assert np.array_equal(mat, [[1.0, 0.0, 10.0]]) # NA_action="a string" also accepted: - mat = build_design_matrices([builder], - {"x": [10.0, np.nan, 20.0], - "c": np.asarray(["c1", "c2", None], - dtype=object)}, - NA_action="drop")[0] + mat = build_design_matrices( + [builder], + {"x": [10.0, np.nan, 20.0], "c": np.asarray(["c1", "c2", None], dtype=object)}, + NA_action="drop", + )[0] assert mat.shape == (1, 3) assert np.array_equal(mat, [[1.0, 0.0, 10.0]]) # And objects from patsy.missing import NAAction + # allows NaN's to pass through NA_action = NAAction(NA_types=[]) - mat = build_design_matrices([builder], - {"x": [10.0, np.nan], - "c": np.asarray(["c1", "c2"], - dtype=object)}, - NA_action=NA_action)[0] + mat = build_design_matrices( + [builder], + {"x": [10.0, np.nan], "c": np.asarray(["c1", "c2"], dtype=object)}, + NA_action=NA_action, + )[0] assert mat.shape == (2, 3) # According to this (and only this) function, NaN == NaN. 
np.testing.assert_array_equal(mat, [[1.0, 0.0, 10.0], [0.0, 1.0, np.nan]]) # NA_action="raise" - pytest.raises(PatsyError, - build_design_matrices, - [builder], - {"x": [10.0, np.nan, 20.0], - "c": np.asarray(["c1", "c2", None], - dtype=object)}, - NA_action="raise") + pytest.raises( + PatsyError, + build_design_matrices, + [builder], + {"x": [10.0, np.nan, 20.0], "c": np.asarray(["c1", "c2", None], dtype=object)}, + NA_action="raise", + ) + def test_NA_drop_preserves_levels(): # Even if all instances of some level are dropped, we still include it in # the output matrix (as an all-zeros column) data = {"x": [1.0, np.nan, 3.0], "c": ["c1", "c2", "c3"]} + def iter_maker(): yield data + design_info = design_matrix_builders([make_termlist("x", "c")], iter_maker, 0)[0] assert design_info.column_names == ["c[c1]", "c[c2]", "c[c3]", "x"] - mat, = build_design_matrices([design_info], data) + (mat,) = build_design_matrices([design_info], data) assert mat.shape == (2, 4) - assert np.array_equal(mat, [[1.0, 0.0, 0.0, 1.0], - [0.0, 0.0, 1.0, 3.0]]) + assert np.array_equal(mat, [[1.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 3.0]]) + def test_return_type_pandas(): if not have_pandas: return - data = pandas.DataFrame({"x": [1, 2, 3], - "y": [4, 5, 6], - "a": ["a1", "a2", "a1"]}, - index=[10, 20, 30]) + data = pandas.DataFrame( + {"x": [1, 2, 3], "y": [4, 5, 6], "a": ["a1", "a2", "a1"]}, index=[10, 20, 30] + ) + def iter_maker(): yield data - int_builder, = design_matrix_builders([make_termlist([])], iter_maker, 0) - (y_builder, x_builder) = design_matrix_builders([make_termlist("y"), - make_termlist("x")], - iter_maker, - eval_env=0) - (x_a_builder,) = design_matrix_builders([make_termlist("x", "a")], - iter_maker, - eval_env=0) - (x_y_builder,) = design_matrix_builders([make_termlist("x", "y")], - iter_maker, - eval_env=0) + + (int_builder,) = design_matrix_builders([make_termlist([])], iter_maker, 0) + (y_builder, x_builder) = design_matrix_builders( + [make_termlist("y"), make_termlist("x")], iter_maker, eval_env=0 + ) + (x_a_builder,) = design_matrix_builders( + [make_termlist("x", "a")], iter_maker, eval_env=0 + ) + (x_y_builder,) = design_matrix_builders( + [make_termlist("x", "y")], iter_maker, eval_env=0 + ) # Index compatibility is always checked for pandas input, regardless of # whether we're producing pandas output - pytest.raises(PatsyError, - build_design_matrices, - [x_a_builder], {"x": data["x"], "a": data["a"][::-1]}) - pytest.raises(PatsyError, - build_design_matrices, - [y_builder, x_builder], - {"x": data["x"], "y": data["y"][::-1]}) + pytest.raises( + PatsyError, + build_design_matrices, + [x_a_builder], + {"x": data["x"], "a": data["a"][::-1]}, + ) + pytest.raises( + PatsyError, + build_design_matrices, + [y_builder, x_builder], + {"x": data["x"], "y": data["y"][::-1]}, + ) + # And we also check consistency between data.index and value indexes # Creating a mismatch between these is a bit tricky. 
We want a data object # such that isinstance(data, DataFrame), but data["x"].index != @@ -361,20 +401,20 @@ def __getitem__(self, key): return pandas.DataFrame.__getitem__(self, key)[::-1] else: return pandas.DataFrame.__getitem__(self, key) - pytest.raises(PatsyError, - build_design_matrices, - [x_builder], - CheatingDataFrame(data)) + + pytest.raises( + PatsyError, build_design_matrices, [x_builder], CheatingDataFrame(data) + ) # A mix of pandas input and unindexed input is fine - (mat,) = build_design_matrices([x_y_builder], - {"x": data["x"], "y": [40, 50, 60]}) + (mat,) = build_design_matrices([x_y_builder], {"x": data["x"], "y": [40, 50, 60]}) assert np.allclose(mat, [[1, 40], [2, 50], [3, 60]]) # with return_type="dataframe", we get out DataFrames with nice indices # and nice column names and design_info - y_df, x_df = build_design_matrices([y_builder, x_builder], data, - return_type="dataframe") + y_df, x_df = build_design_matrices( + [y_builder, x_builder], data, return_type="dataframe" + ) assert isinstance(y_df, pandas.DataFrame) assert isinstance(x_df, pandas.DataFrame) assert np.array_equal(y_df, [[4], [5], [6]]) @@ -389,9 +429,11 @@ def __getitem__(self, key): assert x_df.design_info.term_names == ["x"] # Same with mix of pandas and unindexed info, even if in different # matrices - y_df, x_df = build_design_matrices([y_builder, x_builder], - {"y": [7, 8, 9], "x": data["x"]}, - return_type="dataframe") + y_df, x_df = build_design_matrices( + [y_builder, x_builder], + {"y": [7, 8, 9], "x": data["x"]}, + return_type="dataframe", + ) assert isinstance(y_df, pandas.DataFrame) assert isinstance(x_df, pandas.DataFrame) assert np.array_equal(y_df, [[7], [8], [9]]) @@ -405,75 +447,84 @@ def __getitem__(self, key): assert y_df.design_info.term_names == ["y"] assert x_df.design_info.term_names == ["x"] # Check categorical works for carrying index too - (x_a_df,) = build_design_matrices([x_a_builder], - {"x": [-1, -2, -3], "a": data["a"]}, - return_type="dataframe") + (x_a_df,) = build_design_matrices( + [x_a_builder], {"x": [-1, -2, -3], "a": data["a"]}, return_type="dataframe" + ) assert isinstance(x_a_df, pandas.DataFrame) assert np.array_equal(x_a_df, [[1, 0, -1], [0, 1, -2], [1, 0, -3]]) assert np.array_equal(x_a_df.index, [10, 20, 30]) # And if we have no indexed input, then we let pandas make up an index as # per its usual rules: - (x_y_df,) = build_design_matrices([x_y_builder], - {"y": [7, 8, 9], "x": [10, 11, 12]}, - return_type="dataframe") + (x_y_df,) = build_design_matrices( + [x_y_builder], {"y": [7, 8, 9], "x": [10, 11, 12]}, return_type="dataframe" + ) assert isinstance(x_y_df, pandas.DataFrame) assert np.array_equal(x_y_df, [[10, 7], [11, 8], [12, 9]]) assert np.array_equal(x_y_df.index, [0, 1, 2]) # If 'data' is a DataFrame, then that suffices, even if no factors are # available. 
- (int_df,) = build_design_matrices([int_builder], data, - return_type="dataframe") + (int_df,) = build_design_matrices([int_builder], data, return_type="dataframe") assert isinstance(int_df, pandas.DataFrame) assert np.array_equal(int_df, [[1], [1], [1]]) assert int_df.index.equals(pandas.Index([10, 20, 30])) import patsy.build + had_pandas = patsy.build.have_pandas try: patsy.build.have_pandas = False # return_type="dataframe" gives a nice error if pandas is not available - pytest.raises(PatsyError, - build_design_matrices, - [x_builder], {"x": [1, 2, 3]}, return_type="dataframe") + pytest.raises( + PatsyError, + build_design_matrices, + [x_builder], + {"x": [1, 2, 3]}, + return_type="dataframe", + ) finally: patsy.build.have_pandas = had_pandas - x_df, = build_design_matrices([x_a_builder], - {"x": [1.0, np.nan, 3.0], - "a": np.asarray([None, "a2", "a1"], - dtype=object)}, - NA_action="drop", - return_type="dataframe") + (x_df,) = build_design_matrices( + [x_a_builder], + {"x": [1.0, np.nan, 3.0], "a": np.asarray([None, "a2", "a1"], dtype=object)}, + NA_action="drop", + return_type="dataframe", + ) assert x_df.index.equals(pandas.Index([2])) + def test_data_mismatch(): test_cases_twoway = [ # Data type mismatch ([1, 2, 3], [True, False, True]), - (C(["a", "b", "c"], levels=["c", "b", "a"]), - C(["a", "b", "c"], levels=["a", "b", "c"])), + ( + C(["a", "b", "c"], levels=["c", "b", "a"]), + C(["a", "b", "c"], levels=["a", "b", "c"]), + ), # column number mismatches ([[1], [2], [3]], [[1, 1], [2, 2], [3, 3]]), ([[1, 1, 1], [2, 2, 2], [3, 3, 3]], [[1, 1], [2, 2], [3, 3]]), - ] + ] test_cases_oneway = [ ([1, 2, 3], ["a", "b", "c"]), ([1, 2, 3], C(["a", "b", "c"])), ([True, False, True], C(["a", "b", "c"])), ([True, False, True], ["a", "b", "c"]), - ] + ] setup_predict_only = [ # This is not an error if both are fed in during make_builders, but it # is an error to pass one to make_builders and the other to # make_matrices. 
(["a", "b", "c"], ["a", "b", "d"]), - ] + ] termlist = make_termlist(["x"]) + def t_incremental(data1, data2): def iter_maker(): yield {"x": data1} yield {"x": data2} + try: builders = design_matrix_builders([termlist], iter_maker, 0) build_design_matrices(builders, {"x": data1}) @@ -482,30 +533,34 @@ def iter_maker(): pass else: raise AssertionError + def t_setup_predict(data1, data2): def iter_maker(): yield {"x": data1} + builders = design_matrix_builders([termlist], iter_maker, 0) - pytest.raises(PatsyError, - build_design_matrices, builders, {"x": data2}) - for (a, b) in test_cases_twoway: + pytest.raises(PatsyError, build_design_matrices, builders, {"x": data2}) + + for a, b in test_cases_twoway: t_incremental(a, b) t_incremental(b, a) t_setup_predict(a, b) t_setup_predict(b, a) - for (a, b) in test_cases_oneway: + for a, b in test_cases_oneway: t_incremental(a, b) t_setup_predict(a, b) - for (a, b) in setup_predict_only: + for a, b in setup_predict_only: t_setup_predict(a, b) t_setup_predict(b, a) - pytest.raises(PatsyError, - make_matrix, {"x": [1, 2, 3], "y": [1, 2, 3, 4]}, - 2, [["x"], ["y"]]) + pytest.raises( + PatsyError, make_matrix, {"x": [1, 2, 3], "y": [1, 2, 3, 4]}, 2, [["x"], ["y"]] + ) + def test_data_independent_builder(): data = {"x": [1, 2, 3]} + def iter_maker(): yield data @@ -517,20 +572,20 @@ def iter_maker(): null_builder = design_matrix_builders([make_termlist()], iter_maker, 0)[0] pytest.raises(PatsyError, build_design_matrices, [null_builder], data) - intercept_builder = design_matrix_builders([make_termlist([])], - iter_maker, - eval_env=0)[0] + intercept_builder = design_matrix_builders( + [make_termlist([])], iter_maker, eval_env=0 + )[0] pytest.raises(PatsyError, build_design_matrices, [intercept_builder], data) - pytest.raises(PatsyError, - build_design_matrices, - [null_builder, intercept_builder], data) + pytest.raises( + PatsyError, build_design_matrices, [null_builder, intercept_builder], data + ) # If data is a DataFrame, it sets the number of rows. if have_pandas: - int_m, null_m = build_design_matrices([intercept_builder, - null_builder], - pandas.DataFrame(data)) + int_m, null_m = build_design_matrices( + [intercept_builder, null_builder], pandas.DataFrame(data) + ) assert np.allclose(int_m, [[1], [1], [1]]) assert null_m.shape == (3, 0) @@ -538,25 +593,28 @@ def iter_maker(): # data-independent matrices have the same number of rows. 
x_termlist = make_termlist(["x"]) - builders = design_matrix_builders([x_termlist, make_termlist()], - iter_maker, - eval_env=0) + builders = design_matrix_builders( + [x_termlist, make_termlist()], iter_maker, eval_env=0 + ) x_m, null_m = build_design_matrices(builders, data) assert np.allclose(x_m, [[1], [2], [3]]) assert null_m.shape == (3, 0) - builders = design_matrix_builders([x_termlist, make_termlist([])], - iter_maker, - eval_env=0) + builders = design_matrix_builders( + [x_termlist, make_termlist([])], iter_maker, eval_env=0 + ) x_m, null_m = build_design_matrices(builders, data) x_m, intercept_m = build_design_matrices(builders, data) assert np.allclose(x_m, [[1], [2], [3]]) assert np.allclose(intercept_m, [[1], [1], [1]]) + def test_same_factor_in_two_matrices(): data = {"x": [1, 2, 3], "a": ["a1", "a2", "a1"]} + def iter_maker(): yield data + t1 = make_termlist(["x"]) t2 = make_termlist(["x", "a"]) builders = design_matrix_builders([t1, t2], iter_maker, eval_env=0) @@ -566,11 +624,17 @@ def iter_maker(): check_design_matrix(m2, 2, t2, column_names=["x:a[a1]", "x:a[a2]"]) assert np.allclose(m2, [[1, 0], [0, 2], [3, 0]]) + def test_eval_env_type_builder(): data = {"x": [1, 2, 3]} + def iter_maker(): yield data - pytest.raises(TypeError, design_matrix_builders, [make_termlist("x")], iter_maker, "foo") + + pytest.raises( + TypeError, design_matrix_builders, [make_termlist("x")], iter_maker, "foo" + ) + def test_categorical(): data_strings = {"a": ["a1", "a2", "a1"]} @@ -579,107 +643,114 @@ def test_categorical(): if have_pandas_categorical: data_pandas = {"a": pandas.Categorical(["a1", "a2", "a2"])} datas.append(data_pandas) + def t(data1, data2): def iter_maker(): yield data1 - builders = design_matrix_builders([make_termlist(["a"])], - iter_maker, - eval_env=0) + + builders = design_matrix_builders( + [make_termlist(["a"])], iter_maker, eval_env=0 + ) build_design_matrices(builders, data2) + for data1 in datas: for data2 in datas: t(data1, data2) + def test_contrast(): from patsy.contrasts import ContrastMatrix, Sum + values = ["a1", "a3", "a1", "a2"] # No intercept in model, full-rank coding of 'a' - m = make_matrix({"a": C(values)}, 3, [["a"]], - column_names=["a[a1]", "a[a2]", "a[a3]"]) + m = make_matrix( + {"a": C(values)}, 3, [["a"]], column_names=["a[a1]", "a[a2]", "a[a3]"] + ) - assert np.allclose(m, [[1, 0, 0], - [0, 0, 1], - [1, 0, 0], - [0, 1, 0]]) + assert np.allclose(m, [[1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]]) for s in (Sum, Sum()): - m = make_matrix({"a": C(values, s)}, 3, [["a"]], - column_names=["a[mean]", "a[S.a1]", "a[S.a2]"]) + m = make_matrix( + {"a": C(values, s)}, + 3, + [["a"]], + column_names=["a[mean]", "a[S.a1]", "a[S.a2]"], + ) # Output from R - assert np.allclose(m, [[1, 1, 0], - [1,-1, -1], - [1, 1, 0], - [1, 0, 1]]) - - m = make_matrix({"a": C(values, Sum(omit=0))}, 3, [["a"]], - column_names=["a[mean]", "a[S.a2]", "a[S.a3]"]) + assert np.allclose(m, [[1, 1, 0], [1, -1, -1], [1, 1, 0], [1, 0, 1]]) + + m = make_matrix( + {"a": C(values, Sum(omit=0))}, + 3, + [["a"]], + column_names=["a[mean]", "a[S.a2]", "a[S.a3]"], + ) # Output from R - assert np.allclose(m, [[1, -1, -1], - [1, 0, 1], - [1, -1, -1], - [1, 1, 0]]) + assert np.allclose(m, [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]]) # Intercept in model, non-full-rank coding of 'a' - m = make_matrix({"a": C(values)}, 3, [[], ["a"]], - column_names=["Intercept", "a[T.a2]", "a[T.a3]"]) + m = make_matrix( + {"a": C(values)}, + 3, + [[], ["a"]], + column_names=["Intercept", "a[T.a2]", "a[T.a3]"], 
+ ) - assert np.allclose(m, [[1, 0, 0], - [1, 0, 1], - [1, 0, 0], - [1, 1, 0]]) + assert np.allclose(m, [[1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 1, 0]]) for s in (Sum, Sum()): - m = make_matrix({"a": C(values, s)}, 3, [[], ["a"]], - column_names=["Intercept", "a[S.a1]", "a[S.a2]"]) + m = make_matrix( + {"a": C(values, s)}, + 3, + [[], ["a"]], + column_names=["Intercept", "a[S.a1]", "a[S.a2]"], + ) # Output from R - assert np.allclose(m, [[1, 1, 0], - [1,-1, -1], - [1, 1, 0], - [1, 0, 1]]) - - m = make_matrix({"a": C(values, Sum(omit=0))}, 3, [[], ["a"]], - column_names=["Intercept", "a[S.a2]", "a[S.a3]"]) + assert np.allclose(m, [[1, 1, 0], [1, -1, -1], [1, 1, 0], [1, 0, 1]]) + + m = make_matrix( + {"a": C(values, Sum(omit=0))}, + 3, + [[], ["a"]], + column_names=["Intercept", "a[S.a2]", "a[S.a3]"], + ) # Output from R - assert np.allclose(m, [[1, -1, -1], - [1, 0, 1], - [1, -1, -1], - [1, 1, 0]]) + assert np.allclose(m, [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]]) # Weird ad hoc less-than-full-rank coding of 'a' - m = make_matrix({"a": C(values, [[7, 12], - [2, 13], - [8, -1]])}, - 2, [["a"]], - column_names=["a[custom0]", "a[custom1]"]) - assert np.allclose(m, [[7, 12], - [8, -1], - [7, 12], - [2, 13]]) - - m = make_matrix({"a": C(values, ContrastMatrix([[7, 12], - [2, 13], - [8, -1]], - ["[foo]", "[bar]"]))}, - 2, [["a"]], - column_names=["a[foo]", "a[bar]"]) - assert np.allclose(m, [[7, 12], - [8, -1], - [7, 12], - [2, 13]]) + m = make_matrix( + {"a": C(values, [[7, 12], [2, 13], [8, -1]])}, + 2, + [["a"]], + column_names=["a[custom0]", "a[custom1]"], + ) + assert np.allclose(m, [[7, 12], [8, -1], [7, 12], [2, 13]]) + + m = make_matrix( + { + "a": C( + values, ContrastMatrix([[7, 12], [2, 13], [8, -1]], ["[foo]", "[bar]"]) + ) + }, + 2, + [["a"]], + column_names=["a[foo]", "a[bar]"], + ) + assert np.allclose(m, [[7, 12], [8, -1], [7, 12], [2, 13]]) + def test_DesignInfo_subset(): # For each combination of: # formula, term names, term objects, mixed term name and term objects # check that results match subset of full build # and that removed variables don't hurt - all_data = {"x": [1, 2], - "y": [[3.1, 3.2], - [4.1, 4.2]], - "z": [5, 6]} + all_data = {"x": [1, 2], "y": [[3.1, 3.2], [4.1, 4.2]], "z": [5, 6]} all_terms = make_termlist("x", "y", "z") + def iter_maker(): yield all_data + all_builder = design_matrix_builders([all_terms], iter_maker, 0)[0] full_matrix = build_design_matrices([all_builder], all_data)[0] @@ -718,8 +789,7 @@ def t(which_terms, variables, columns): # Term must exist pytest.raises(KeyError, all_builder.subset, "~ asdf") pytest.raises(KeyError, all_builder.subset, ["asdf"]) - pytest.raises(KeyError, - all_builder.subset, [Term(["asdf"])]) + pytest.raises(KeyError, all_builder.subset, [Term(["asdf"])]) # Also check for a minimal DesignInfo (column names only) min_di = DesignInfo(["a", "b", "c"]) diff --git a/patsy/test_highlevel.py b/patsy/test_highlevel.py index 66c293a..35c86a1 100644 --- a/patsy/test_highlevel.py +++ b/patsy/test_highlevel.py @@ -14,22 +14,30 @@ from patsy.categorical import C from patsy.contrasts import Helmert from patsy.user_util import balanced, LookupFactor -from patsy.build import (design_matrix_builders, - build_design_matrices) -from patsy.highlevel import (dmatrix, dmatrices, - incr_dbuilder, incr_dbuilders) -from patsy.util import (have_pandas, - have_pandas_categorical, - have_pandas_categorical_dtype, - pandas_Categorical_from_codes) +from patsy.build import design_matrix_builders, build_design_matrices +from patsy.highlevel import 
dmatrix, dmatrices, incr_dbuilder, incr_dbuilders +from patsy.util import ( + have_pandas, + have_pandas_categorical, + have_pandas_categorical_dtype, + pandas_Categorical_from_codes, +) from patsy.origin import Origin if have_pandas: import pandas -def check_result(expect_full_designs, lhs, rhs, data, - expected_rhs_values, expected_rhs_names, - expected_lhs_values, expected_lhs_names): # pragma: no cover + +def check_result( + expect_full_designs, + lhs, + rhs, + data, + expected_rhs_values, + expected_rhs_names, + expected_lhs_values, + expected_lhs_names, +): # pragma: no cover assert np.allclose(rhs, expected_rhs_values) assert rhs.design_info.column_names == expected_rhs_names if lhs is not None: @@ -41,11 +49,11 @@ def check_result(expect_full_designs, lhs, rhs, data, if expect_full_designs: if lhs is None: - new_rhs, = build_design_matrices([rhs.design_info], data) + (new_rhs,) = build_design_matrices([rhs.design_info], data) else: - new_lhs, new_rhs = build_design_matrices([lhs.design_info, - rhs.design_info], - data) + new_lhs, new_rhs = build_design_matrices( + [lhs.design_info, rhs.design_info], data + ) assert np.allclose(new_lhs, lhs) assert new_lhs.design_info.column_names == expected_lhs_names assert np.allclose(new_rhs, rhs) @@ -54,30 +62,42 @@ def check_result(expect_full_designs, lhs, rhs, data, assert rhs.design_info.terms is None assert lhs is None or lhs.design_info.terms is None + def dmatrix_pandas(formula_like, data={}, depth=0, return_type="matrix"): return_type = "dataframe" if isinstance(depth, int): depth += 1 return dmatrix(formula_like, data, depth, return_type=return_type) + def dmatrices_pandas(formula_like, data={}, depth=0, return_type="matrix"): return_type = "dataframe" if isinstance(depth, int): depth += 1 return dmatrices(formula_like, data, depth, return_type=return_type) -def t(formula_like, data, depth, - expect_full_designs, - expected_rhs_values, expected_rhs_names, - expected_lhs_values=None, expected_lhs_names=None): # pragma: no cover + +def t( + formula_like, + data, + depth, + expect_full_designs, + expected_rhs_values, + expected_rhs_names, + expected_lhs_values=None, + expected_lhs_names=None, +): # pragma: no cover if isinstance(depth, int): depth += 1 + def data_iter_maker(): return iter([data]) - if (isinstance(formula_like, (str, ModelDesc, DesignInfo)) - or (isinstance(formula_like, tuple) - and isinstance(formula_like[0], DesignInfo)) - or hasattr(formula_like, "__patsy_get_model_desc__")): + + if ( + isinstance(formula_like, (str, ModelDesc, DesignInfo)) + or (isinstance(formula_like, tuple) and isinstance(formula_like[0], DesignInfo)) + or hasattr(formula_like, "__patsy_get_model_desc__") + ): if expected_lhs_values is None: builder = incr_dbuilder(formula_like, data_iter_maker, depth) lhs = None @@ -85,14 +105,19 @@ def data_iter_maker(): else: builders = incr_dbuilders(formula_like, data_iter_maker, depth) lhs, rhs = build_design_matrices(builders, data) - check_result(expect_full_designs, lhs, rhs, data, - expected_rhs_values, expected_rhs_names, - expected_lhs_values, expected_lhs_names) + check_result( + expect_full_designs, + lhs, + rhs, + data, + expected_rhs_values, + expected_rhs_names, + expected_lhs_values, + expected_lhs_names, + ) else: - pytest.raises(PatsyError, incr_dbuilders, - formula_like, data_iter_maker) - pytest.raises(PatsyError, incr_dbuilder, - formula_like, data_iter_maker) + pytest.raises(PatsyError, incr_dbuilders, formula_like, data_iter_maker) + pytest.raises(PatsyError, incr_dbuilder, formula_like, 
data_iter_maker) one_mat_fs = [dmatrix] two_mat_fs = [dmatrices] if have_pandas: @@ -101,9 +126,16 @@ def data_iter_maker(): if expected_lhs_values is None: for f in one_mat_fs: rhs = f(formula_like, data, depth) - check_result(expect_full_designs, None, rhs, data, - expected_rhs_values, expected_rhs_names, - expected_lhs_values, expected_lhs_names) + check_result( + expect_full_designs, + None, + rhs, + data, + expected_rhs_values, + expected_rhs_names, + expected_lhs_values, + expected_lhs_names, + ) # We inline assert_raises here to avoid complications with the # depth argument. @@ -125,11 +157,19 @@ def data_iter_maker(): for f in two_mat_fs: (lhs, rhs) = f(formula_like, data, depth) - check_result(expect_full_designs, lhs, rhs, data, - expected_rhs_values, expected_rhs_names, - expected_lhs_values, expected_lhs_names) - -def t_invalid(formula_like, data, depth, exc=PatsyError): # pragma: no cover + check_result( + expect_full_designs, + lhs, + rhs, + data, + expected_rhs_values, + expected_rhs_names, + expected_lhs_values, + expected_lhs_names, + ) + + +def t_invalid(formula_like, data, depth, exc=PatsyError): # pragma: no cover if isinstance(depth, int): depth += 1 fs = [dmatrix, dmatrices] @@ -143,52 +183,92 @@ def t_invalid(formula_like, data, depth, exc=PatsyError): # pragma: no cover else: raise AssertionError + # Exercise all the different calling conventions for the high-level API def test_formula_likes(): # Plain array-like, rhs only - t([[1, 2, 3], [4, 5, 6]], {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"]) - t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"]) - t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"]) - t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"]) + t([[1, 2, 3], [4, 5, 6]], {}, 0, False, [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"]) + t( + (None, [[1, 2, 3], [4, 5, 6]]), + {}, + 0, + False, + [[1, 2, 3], [4, 5, 6]], + ["x0", "x1", "x2"], + ) + t( + np.asarray([[1, 2, 3], [4, 5, 6]]), + {}, + 0, + False, + [[1, 2, 3], [4, 5, 6]], + ["x0", "x1", "x2"], + ) + t( + (None, np.asarray([[1, 2, 3], [4, 5, 6]])), + {}, + 0, + False, + [[1, 2, 3], [4, 5, 6]], + ["x0", "x1", "x2"], + ) dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo") - t(dm, {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"]) - t((None, dm), {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"]) + t(dm, {}, 0, False, [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"]) + t((None, dm), {}, 0, False, [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"]) # Plain array-likes, lhs and rhs - t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], - [[1], [2]], ["y0"]) - t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], - [[1], [2]], ["y0"]) - t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], - [[1], [2]], ["y0"]) - t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], - [[1], [2]], ["y0"]) + t( + ([1, 2], [[1, 2, 3], [4, 5, 6]]), + {}, + 0, + False, + [[1, 2, 3], [4, 5, 6]], + ["x0", "x1", "x2"], + [[1], [2]], + ["y0"], + ) + t( + ([[1], [2]], [[1, 2, 3], [4, 5, 6]]), + {}, + 0, + False, + [[1, 2, 3], [4, 5, 6]], + ["x0", "x1", "x2"], + [[1], [2]], + ["y0"], + ) + t( + 
(np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), + {}, + 0, + False, + [[1, 2, 3], [4, 5, 6]], + ["x0", "x1", "x2"], + [[1], [2]], + ["y0"], + ) + t( + (np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), + {}, + 0, + False, + [[1, 2, 3], [4, 5, 6]], + ["x0", "x1", "x2"], + [[1], [2]], + ["y0"], + ) x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo") y_dm = DesignMatrix([1, 2], default_column_prefix="bar") - t((y_dm, x_dm), {}, 0, - False, - [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"], - [[1], [2]], ["bar0"]) + t( + (y_dm, x_dm), + {}, + 0, + False, + [[1, 2, 3], [4, 5, 6]], + ["foo0", "foo1", "foo2"], + [[1], [2]], + ["bar0"], + ) # number of rows must match t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0) @@ -199,132 +279,210 @@ def test_formula_likes(): # plain Series and DataFrames if have_pandas: # Names are extracted - t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0, - False, - [[1], [2], [3]], ["x"]) - t(pandas.Series([1, 2, 3], name="asdf"), {}, 0, - False, - [[1], [2], [3]], ["asdf"]) - t((pandas.DataFrame({"y": [4, 5, 6]}), - pandas.DataFrame({"x": [1, 2, 3]})), {}, 0, - False, - [[1], [2], [3]], ["x"], - [[4], [5], [6]], ["y"]) - t((pandas.Series([4, 5, 6], name="y"), - pandas.Series([1, 2, 3], name="x")), {}, 0, - False, - [[1], [2], [3]], ["x"], - [[4], [5], [6]], ["y"]) + t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0, False, [[1], [2], [3]], ["x"]) + t( + pandas.Series([1, 2, 3], name="asdf"), + {}, + 0, + False, + [[1], [2], [3]], + ["asdf"], + ) + t( + (pandas.DataFrame({"y": [4, 5, 6]}), pandas.DataFrame({"x": [1, 2, 3]})), + {}, + 0, + False, + [[1], [2], [3]], + ["x"], + [[4], [5], [6]], + ["y"], + ) + t( + (pandas.Series([4, 5, 6], name="y"), pandas.Series([1, 2, 3], name="x")), + {}, + 0, + False, + [[1], [2], [3]], + ["x"], + [[4], [5], [6]], + ["y"], + ) # Or invented - t((pandas.DataFrame([[4, 5, 6]]), - pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0, - False, - [[1, 2, 3]], ["x7", "x8", "x9"], - [[4, 5, 6]], ["y0", "y1", "y2"]) - t(pandas.Series([1, 2, 3]), {}, 0, - False, - [[1], [2], [3]], ["x0"]) + t( + ( + pandas.DataFrame([[4, 5, 6]]), + pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9]), + ), + {}, + 0, + False, + [[1, 2, 3]], + ["x7", "x8", "x9"], + [[4, 5, 6]], + ["y0", "y1", "y2"], + ) + t(pandas.Series([1, 2, 3]), {}, 0, False, [[1], [2], [3]], ["x0"]) # indices must match - t_invalid((pandas.DataFrame([[1]], index=[1]), - pandas.DataFrame([[1]], index=[2])), - {}, 0) + t_invalid( + (pandas.DataFrame([[1]], index=[1]), pandas.DataFrame([[1]], index=[2])), + {}, + 0, + ) # Foreign ModelDesc factories class ForeignModelSource(object): def __patsy_get_model_desc__(self, data): - return ModelDesc([Term([LookupFactor("Y")])], - [Term([LookupFactor("X")])]) + return ModelDesc([Term([LookupFactor("Y")])], [Term([LookupFactor("X")])]) + foreign_model = ForeignModelSource() - t(foreign_model, - {"Y": [1, 2], - "X": [[1, 2], [3, 4]]}, - 0, - True, - [[1, 2], [3, 4]], ["X[0]", "X[1]"], - [[1], [2]], ["Y"]) + t( + foreign_model, + {"Y": [1, 2], "X": [[1, 2], [3, 4]]}, + 0, + True, + [[1, 2], [3, 4]], + ["X[0]", "X[1]"], + [[1], [2]], + ["Y"], + ) + class BadForeignModelSource(object): def __patsy_get_model_desc__(self, data): return data + t_invalid(BadForeignModelSource(), {}, 0) # string formulas - t("y ~ x", {"y": [1, 2], "x": [3, 4]}, 0, - True, - [[1, 3], [1, 4]], ["Intercept", "x"], - [[1], [2]], ["y"]) - t("~ x", {"y": [1, 2], "x": [3, 4]}, 0, - True, - [[1, 3], [1, 4]], ["Intercept", "x"]) - t("x + y", {"y": 
[1, 2], "x": [3, 4]}, 0, - True, - [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"]) + t( + "y ~ x", + {"y": [1, 2], "x": [3, 4]}, + 0, + True, + [[1, 3], [1, 4]], + ["Intercept", "x"], + [[1], [2]], + ["y"], + ) + t("~ x", {"y": [1, 2], "x": [3, 4]}, 0, True, [[1, 3], [1, 4]], ["Intercept", "x"]) + t( + "x + y", + {"y": [1, 2], "x": [3, 4]}, + 0, + True, + [[1, 3, 1], [1, 4, 2]], + ["Intercept", "x", "y"], + ) # ModelDesc desc = ModelDesc([], [Term([LookupFactor("x")])]) - t(desc, {"x": [1.5, 2.5, 3.5]}, 0, - True, - [[1.5], [2.5], [3.5]], ["x"]) + t(desc, {"x": [1.5, 2.5, 3.5]}, 0, True, [[1.5], [2.5], [3.5]], ["x"]) desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])]) - t(desc, {"x": [1.5, 2.5, 3.5]}, 0, - True, - [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"]) - desc = ModelDesc([Term([LookupFactor("y")])], - [Term([]), Term([LookupFactor("x")])]) - t(desc, {"x": [1.5, 2.5, 3.5], "y": [10, 20, 30]}, 0, - True, - [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"], - [[10], [20], [30]], ["y"]) + t( + desc, + {"x": [1.5, 2.5, 3.5]}, + 0, + True, + [[1, 1.5], [1, 2.5], [1, 3.5]], + ["Intercept", "x"], + ) + desc = ModelDesc([Term([LookupFactor("y")])], [Term([]), Term([LookupFactor("x")])]) + t( + desc, + {"x": [1.5, 2.5, 3.5], "y": [10, 20, 30]}, + 0, + True, + [[1, 1.5], [1, 2.5], [1, 3.5]], + ["Intercept", "x"], + [[10], [20], [30]], + ["y"], + ) # builders - termlists = ([], - [Term([LookupFactor("x")])], - [Term([]), Term([LookupFactor("x")])], - ) - builders = design_matrix_builders(termlists, - lambda: iter([{"x": [1, 2, 3]}]), - eval_env=0) + termlists = ( + [], + [Term([LookupFactor("x")])], + [Term([]), Term([LookupFactor("x")])], + ) + builders = design_matrix_builders( + termlists, lambda: iter([{"x": [1, 2, 3]}]), eval_env=0 + ) # twople but with no LHS - t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0, - True, - [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"]) + t( + (builders[0], builders[2]), + {"x": [10, 20, 30]}, + 0, + True, + [[1, 10], [1, 20], [1, 30]], + ["Intercept", "x"], + ) # single DesignInfo - t(builders[2], {"x": [10, 20, 30]}, 0, - True, - [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"]) + t( + builders[2], + {"x": [10, 20, 30]}, + 0, + True, + [[1, 10], [1, 20], [1, 30]], + ["Intercept", "x"], + ) # twople with LHS - t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0, - True, - [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"], - [[10], [20], [30]], ["x"]) + t( + (builders[1], builders[2]), + {"x": [10, 20, 30]}, + 0, + True, + [[1, 10], [1, 20], [1, 30]], + ["Intercept", "x"], + [[10], [20], [30]], + ["x"], + ) # check depth arguments x_in_env = [1, 2, 3] - t("~ x_in_env", {}, 0, - True, - [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"]) - t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0, - True, - [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"]) + t("~ x_in_env", {}, 0, True, [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"]) + t( + "~ x_in_env", + {"x_in_env": [10, 20, 30]}, + 0, + True, + [[1, 10], [1, 20], [1, 30]], + ["Intercept", "x_in_env"], + ) # Trying to pull x_in_env out of our *caller* shouldn't work. 
t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError)) + # But then again it should, if called from one down on the stack: def check_nested_call(): x_in_env = "asdf" - t("~ x_in_env", {}, 1, - True, - [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"]) + t( + "~ x_in_env", + {}, + 1, + True, + [[1, 1], [1, 2], [1, 3]], + ["Intercept", "x_in_env"], + ) + check_nested_call() # passing in an explicit EvalEnvironment also works: e = EvalEnvironment.capture(1) t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError)) e = EvalEnvironment.capture(0) + def check_nested_call_2(): x_in_env = "asdf" - t("~ x_in_env", {}, e, - True, - [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"]) + t( + "~ x_in_env", + {}, + e, + True, + [[1, 1], [1, 2], [1, 3]], + ["Intercept", "x_in_env"], + ) + check_nested_call_2() + def test_return_pandas(): if not have_pandas: return @@ -369,166 +527,220 @@ def test_return_pandas(): assert np.array_equal(df10.index, s1.index) # pandas must be available import patsy.highlevel + had_pandas = patsy.highlevel.have_pandas try: patsy.highlevel.have_pandas = False - pytest.raises(PatsyError, - dmatrix, "x", {"x": [1]}, 0, return_type="dataframe") - pytest.raises(PatsyError, - dmatrices, "y ~ x", {"x": [1], "y": [2]}, 0, - return_type="dataframe") + pytest.raises(PatsyError, dmatrix, "x", {"x": [1]}, 0, return_type="dataframe") + pytest.raises( + PatsyError, + dmatrices, + "y ~ x", + {"x": [1], "y": [2]}, + 0, + return_type="dataframe", + ) finally: patsy.highlevel.have_pandas = had_pandas + def test_term_info(): data = balanced(a=2, b=2) rhs = dmatrix("a:b", data) - assert rhs.design_info.column_names == ["Intercept", "b[T.b2]", - "a[T.a2]:b[b1]", "a[T.a2]:b[b2]"] + assert rhs.design_info.column_names == [ + "Intercept", + "b[T.b2]", + "a[T.a2]:b[b1]", + "a[T.a2]:b[b2]", + ] assert rhs.design_info.term_names == ["Intercept", "a:b"] assert len(rhs.design_info.terms) == 2 assert rhs.design_info.terms[0] == INTERCEPT + def test_data_types(): - data = {"a": [1, 2, 3], - "b": [1.0, 2.0, 3.0], - "c": np.asarray([1, 2, 3], dtype=np.float32), - "d": [True, False, True], - "e": ["foo", "bar", "baz"], - "f": C([1, 2, 3]), - "g": C(["foo", "bar", "baz"]), - "h": np.array(["foo", 1, (1, "hi")], dtype=object), - } - t("~ 0 + a", data, 0, True, - [[1], [2], [3]], ["a"]) - t("~ 0 + b", data, 0, True, - [[1], [2], [3]], ["b"]) - t("~ 0 + c", data, 0, True, - [[1], [2], [3]], ["c"]) - t("~ 0 + d", data, 0, True, - [[0, 1], [1, 0], [0, 1]], ["d[False]", "d[True]"]) - t("~ 0 + e", data, 0, True, - [[0, 0, 1], [1, 0, 0], [0, 1, 0]], ["e[bar]", "e[baz]", "e[foo]"]) - t("~ 0 + f", data, 0, True, - [[1, 0, 0], [0, 1, 0], [0, 0, 1]], ["f[1]", "f[2]", "f[3]"]) - t("~ 0 + g", data, 0, True, - [[0, 0, 1], [1, 0, 0], [0, 1, 0]], ["g[bar]", "g[baz]", "g[foo]"]) + data = { + "a": [1, 2, 3], + "b": [1.0, 2.0, 3.0], + "c": np.asarray([1, 2, 3], dtype=np.float32), + "d": [True, False, True], + "e": ["foo", "bar", "baz"], + "f": C([1, 2, 3]), + "g": C(["foo", "bar", "baz"]), + "h": np.array(["foo", 1, (1, "hi")], dtype=object), + } + t("~ 0 + a", data, 0, True, [[1], [2], [3]], ["a"]) + t("~ 0 + b", data, 0, True, [[1], [2], [3]], ["b"]) + t("~ 0 + c", data, 0, True, [[1], [2], [3]], ["c"]) + t("~ 0 + d", data, 0, True, [[0, 1], [1, 0], [0, 1]], ["d[False]", "d[True]"]) + t( + "~ 0 + e", + data, + 0, + True, + [[0, 0, 1], [1, 0, 0], [0, 1, 0]], + ["e[bar]", "e[baz]", "e[foo]"], + ) + t( + "~ 0 + f", + data, + 0, + True, + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + ["f[1]", "f[2]", "f[3]"], + ) + t( + "~ 0 + 
g", + data, + 0, + True, + [[0, 0, 1], [1, 0, 0], [0, 1, 0]], + ["g[bar]", "g[baz]", "g[foo]"], + ) # This depends on Python's sorting behavior: - t("~ 0 + h", data, 0, True, - [[0, 1, 0], [1, 0, 0], [0, 0, 1]], - ["h[1]", "h[foo]", "h[(1, 'hi')]"]) + t( + "~ 0 + h", + data, + 0, + True, + [[0, 1, 0], [1, 0, 0], [0, 0, 1]], + ["h[1]", "h[foo]", "h[(1, 'hi')]"], + ) + def test_categorical(): data = balanced(a=2, b=2) # There are more exhaustive tests for all the different coding options in # test_build; let's just make sure that C() and stuff works. - t("~ C(a)", data, 0, - True, - [[1, 0], [1, 0], [1, 1], [1, 1]], ["Intercept", "C(a)[T.a2]"]) - t("~ C(a, levels=['a2', 'a1'])", data, 0, - True, - [[1, 1], [1, 1], [1, 0], [1, 0]], - ["Intercept", "C(a, levels=['a2', 'a1'])[T.a1]"]) - t("~ C(a, Treatment(reference=-1))", data, 0, - True, - [[1, 1], [1, 1], [1, 0], [1, 0]], - ["Intercept", "C(a, Treatment(reference=-1))[T.a1]"]) + t( + "~ C(a)", + data, + 0, + True, + [[1, 0], [1, 0], [1, 1], [1, 1]], + ["Intercept", "C(a)[T.a2]"], + ) + t( + "~ C(a, levels=['a2', 'a1'])", + data, + 0, + True, + [[1, 1], [1, 1], [1, 0], [1, 0]], + ["Intercept", "C(a, levels=['a2', 'a1'])[T.a1]"], + ) + t( + "~ C(a, Treatment(reference=-1))", + data, + 0, + True, + [[1, 1], [1, 1], [1, 0], [1, 0]], + ["Intercept", "C(a, Treatment(reference=-1))[T.a1]"], + ) # Different interactions - t("a*b", data, 0, - True, - [[1, 0, 0, 0], - [1, 0, 1, 0], - [1, 1, 0, 0], - [1, 1, 1, 1]], - ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"]) - t("0 + a:b", data, 0, - True, - [[1, 0, 0, 0], - [0, 0, 1, 0], - [0, 1, 0, 0], - [0, 0, 0, 1]], - ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"]) - t("1 + a + a:b", data, 0, - True, - [[1, 0, 0, 0], - [1, 0, 1, 0], - [1, 1, 0, 0], - [1, 1, 0, 1]], - ["Intercept", "a[T.a2]", "a[a1]:b[T.b2]", "a[a2]:b[T.b2]"]) + t( + "a*b", + data, + 0, + True, + [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]], + ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"], + ) + t( + "0 + a:b", + data, + 0, + True, + [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]], + ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"], + ) + t( + "1 + a + a:b", + data, + 0, + True, + [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 0, 1]], + ["Intercept", "a[T.a2]", "a[a1]:b[T.b2]", "a[a2]:b[T.b2]"], + ) # Changing contrast with C() data["a"] = C(data["a"], Helmert) - t("a", data, 0, - True, - [[1, -1], [1, -1], [1, 1], [1, 1]], ["Intercept", "a[H.a2]"]) - t("C(a, Treatment)", data, 0, - True, - [[1, 0], [1, 0], [1, 1], [1, 1]], ["Intercept", "C(a, Treatment)[T.a2]"]) + t("a", data, 0, True, [[1, -1], [1, -1], [1, 1], [1, 1]], ["Intercept", "a[H.a2]"]) + t( + "C(a, Treatment)", + data, + 0, + True, + [[1, 0], [1, 0], [1, 1], [1, 1]], + ["Intercept", "C(a, Treatment)[T.a2]"], + ) # That didn't affect the original object - t("a", data, 0, - True, - [[1, -1], [1, -1], [1, 1], [1, 1]], ["Intercept", "a[H.a2]"]) + t("a", data, 0, True, [[1, -1], [1, -1], [1, 1], [1, 1]], ["Intercept", "a[H.a2]"]) + def test_builtins(): - data = {"x": [1, 2, 3], - "y": [4, 5, 6], - "a b c": [10, 20, 30]} - t("0 + I(x + y)", data, 0, - True, - [[1], [2], [3], [4], [5], [6]], ["I(x + y)"]) - t("Q('a b c')", data, 0, - True, - [[1, 10], [1, 20], [1, 30]], ["Intercept", "Q('a b c')"]) - t("center(x)", data, 0, - True, - [[1, -1], [1, 0], [1, 1]], ["Intercept", "center(x)"]) + data = {"x": [1, 2, 3], "y": [4, 5, 6], "a b c": [10, 20, 30]} + t("0 + I(x + y)", data, 0, True, [[1], [2], [3], [4], [5], [6]], 
["I(x + y)"]) + t( + "Q('a b c')", + data, + 0, + True, + [[1, 10], [1, 20], [1, 30]], + ["Intercept", "Q('a b c')"], + ) + t("center(x)", data, 0, True, [[1, -1], [1, 0], [1, 1]], ["Intercept", "center(x)"]) + def test_incremental(): # incr_dbuilder(s) # stateful transformations datas = [ - {"a": ["a2", "a2", "a2"], - "x": [1, 2, 3]}, - {"a": ["a2", "a2", "a1"], - "x": [4, 5, 6]}, - ] + {"a": ["a2", "a2", "a2"], "x": [1, 2, 3]}, + {"a": ["a2", "a2", "a1"], "x": [4, 5, 6]}, + ] x = np.asarray([1, 2, 3, 4, 5, 6]) sin_center_x = np.sin(x - np.mean(x)) x_col = sin_center_x - np.mean(sin_center_x) + def data_iter_maker(): return iter(datas) - builders = incr_dbuilders("1 ~ a + center(np.sin(center(x)))", - data_iter_maker) + + builders = incr_dbuilders("1 ~ a + center(np.sin(center(x)))", data_iter_maker) lhs, rhs = build_design_matrices(builders, datas[1]) assert lhs.design_info.column_names == ["Intercept"] - assert rhs.design_info.column_names == ["Intercept", - "a[T.a2]", - "center(np.sin(center(x)))"] + assert rhs.design_info.column_names == [ + "Intercept", + "a[T.a2]", + "center(np.sin(center(x)))", + ] assert np.allclose(lhs, [[1], [1], [1]]) - assert np.allclose(rhs, np.column_stack(([1, 1, 1], - [1, 1, 0], - x_col[3:]))) + assert np.allclose(rhs, np.column_stack(([1, 1, 1], [1, 1, 0], x_col[3:]))) - builder = incr_dbuilder("~ a + center(np.sin(center(x)))", - data_iter_maker) + builder = incr_dbuilder("~ a + center(np.sin(center(x)))", data_iter_maker) (rhs,) = build_design_matrices([builder], datas[1]) - assert rhs.design_info.column_names == ["Intercept", - "a[T.a2]", - "center(np.sin(center(x)))"] + assert rhs.design_info.column_names == [ + "Intercept", + "a[T.a2]", + "center(np.sin(center(x)))", + ] assert np.allclose(lhs, [[1], [1], [1]]) - assert np.allclose(rhs, np.column_stack(([1, 1, 1], - [1, 1, 0], - x_col[3:]))) + assert np.allclose(rhs, np.column_stack(([1, 1, 1], [1, 1, 0], x_col[3:]))) pytest.raises(PatsyError, incr_dbuilder, "x ~ x", data_iter_maker) pytest.raises(PatsyError, incr_dbuilders, "x", data_iter_maker) + def test_env_transform(): - t("~ np.sin(x)", {"x": [1, 2, 3]}, 0, - True, - [[1, np.sin(1)], [1, np.sin(2)], [1, np.sin(3)]], - ["Intercept", "np.sin(x)"]) + t( + "~ np.sin(x)", + {"x": [1, 2, 3]}, + 0, + True, + [[1, np.sin(1)], [1, np.sin(2)], [1, np.sin(3)]], + ["Intercept", "np.sin(x)"], + ) + # Term ordering: # 1) all 0-order no-numeric @@ -553,16 +765,14 @@ def t_terms(formula, order): t_terms("b + a + x2 + x1", ["Intercept", "b", "a", "x2", "x1"]) t_terms("0 + x1 + a + x2 + b + 1", ["Intercept", "a", "b", "x1", "x2"]) t_terms("0 + a:b + a + b + 1", ["Intercept", "a", "b", "a:b"]) - t_terms("a + a:x1 + x2 + x1 + b", - ["Intercept", "a", "b", "x1", "a:x1", "x2"]) - t_terms("0 + a:x1:x2 + a + x2:x1:b + x2 + x1 + a:x1 + x1:x2 + x1:a:x2:a:b", - ["a", - "x1:x2", "a:x1:x2", "x2:x1:b", "x1:a:x2:b", - "x2", - "x1", - "a:x1"]) - -def _check_division(expect_true_division): # pragma: no cover + t_terms("a + a:x1 + x2 + x1 + b", ["Intercept", "a", "b", "x1", "a:x1", "x2"]) + t_terms( + "0 + a:x1:x2 + a + x2:x1:b + x2 + x1 + a:x1 + x1:x2 + x1:a:x2:a:b", + ["a", "x1:x2", "a:x1:x2", "x2:x1:b", "x1:a:x2:b", "x2", "x1", "a:x1"], + ) + + +def _check_division(expect_true_division): # pragma: no cover # We evaluate the formula "I(x / y)" in our *caller's* scope, so the # result depends on whether our caller has done 'from __future__ import # division'. 
@@ -573,25 +783,51 @@ def _check_division(expect_true_division): # pragma: no cover else: assert np.allclose(m, [[2]]) + def test_multicolumn(): data = { "a": ["a1", "a2"], "X": [[1, 2], [3, 4]], "Y": [[1, 3], [2, 4]], - } - t("X*Y", data, 0, - True, - [[1, 1, 2, 1, 3, 1 * 1, 2 * 1, 1 * 3, 2 * 3], - [1, 3, 4, 2, 4, 3 * 2, 4 * 2, 3 * 4, 4 * 4]], - ["Intercept", "X[0]", "X[1]", "Y[0]", "Y[1]", - "X[0]:Y[0]", "X[1]:Y[0]", "X[0]:Y[1]", "X[1]:Y[1]"]) - t("a:X + Y", data, 0, - True, - [[1, 1, 0, 2, 0, 1, 3], - [1, 0, 3, 0, 4, 2, 4]], - ["Intercept", - "a[a1]:X[0]", "a[a2]:X[0]", "a[a1]:X[1]", "a[a2]:X[1]", - "Y[0]", "Y[1]"]) + } + t( + "X*Y", + data, + 0, + True, + [ + [1, 1, 2, 1, 3, 1 * 1, 2 * 1, 1 * 3, 2 * 3], + [1, 3, 4, 2, 4, 3 * 2, 4 * 2, 3 * 4, 4 * 4], + ], + [ + "Intercept", + "X[0]", + "X[1]", + "Y[0]", + "Y[1]", + "X[0]:Y[0]", + "X[1]:Y[0]", + "X[0]:Y[1]", + "X[1]:Y[1]", + ], + ) + t( + "a:X + Y", + data, + 0, + True, + [[1, 1, 0, 2, 0, 1, 3], [1, 0, 3, 0, 4, 2, 4]], + [ + "Intercept", + "a[a1]:X[0]", + "a[a2]:X[0]", + "a[a1]:X[1]", + "a[a2]:X[1]", + "Y[0]", + "Y[1]", + ], + ) + def test_dmatrix_dmatrices_no_data(): x = [1, 2, 3] @@ -601,19 +837,22 @@ def test_dmatrix_dmatrices_no_data(): assert np.allclose(lhs, [[4], [5], [6]]) assert np.allclose(rhs, [[1, 1], [1, 2], [1, 3]]) + def test_designinfo_describe(): - lhs, rhs = dmatrices("y ~ x + a", {"y": [1, 2, 3], - "x": [4, 5, 6], - "a": ["a1", "a2", "a3"]}) + lhs, rhs = dmatrices( + "y ~ x + a", {"y": [1, 2, 3], "x": [4, 5, 6], "a": ["a1", "a2", "a3"]} + ) assert lhs.design_info.describe() == "y" assert rhs.design_info.describe() == "1 + a + x" + def test_evalfactor_reraise(): # This will produce a PatsyError, but buried inside the factor evaluation, # so the original code has no way to give it an appropriate origin= # attribute. EvalFactor should notice this, and add a useful origin: def raise_patsy_error(x): raise PatsyError("WHEEEEEE") + formula = "raise_patsy_error(X) + Y" try: dmatrix(formula, {"X": [1, 2, 3], "Y": [4, 5, 6]}) @@ -632,6 +871,7 @@ def raise_patsy_error(x): else: assert False + def test_dmatrix_NA_action(): data = {"x": [1, 2, 3, np.nan], "y": [np.nan, 20, 30, 40]} @@ -641,13 +881,17 @@ def test_dmatrix_NA_action(): for return_type in return_types: mat = dmatrix("x + y", data=data, return_type=return_type) - assert np.array_equal(mat, [[1, 2, 20], - [1, 3, 30]]) + assert np.array_equal(mat, [[1, 2, 20], [1, 3, 30]]) if return_type == "dataframe": assert mat.index.equals(pandas.Index([1, 2])) - pytest.raises(PatsyError, dmatrix, "x + y", data=data, - return_type=return_type, - NA_action="raise") + pytest.raises( + PatsyError, + dmatrix, + "x + y", + data=data, + return_type=return_type, + NA_action="raise", + ) lmat, rmat = dmatrices("y ~ x", data=data, return_type=return_type) assert np.array_equal(lmat, [[20], [30]]) @@ -655,9 +899,14 @@ def test_dmatrix_NA_action(): if return_type == "dataframe": assert lmat.index.equals(pandas.Index([1, 2])) assert rmat.index.equals(pandas.Index([1, 2])) - pytest.raises(PatsyError, - dmatrices, "y ~ x", data=data, return_type=return_type, - NA_action="raise") + pytest.raises( + PatsyError, + dmatrices, + "y ~ x", + data=data, + return_type=return_type, + NA_action="raise", + ) # Initial release for the NA handling code had problems with # non-data-dependent matrices like "~ 1". 
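The NA handling exercised above is worth a standalone illustration: with the default NA_action="drop", any row containing a missing value is removed from the left- and right-hand matrices in step, while NA_action="raise" turns missing data into an error. A small sketch using the same data shape as test_dmatrix_NA_action:

    import numpy as np
    from patsy import dmatrices

    data = {"x": [1, 2, 3, np.nan], "y": [np.nan, 20, 30, 40]}

    # Row 0 (NaN in y) and row 3 (NaN in x) are dropped from both matrices:
    lhs, rhs = dmatrices("y ~ x", data)  # NA_action="drop" is the default
    print(np.asarray(lhs))  # [[20.], [30.]]
    print(np.asarray(rhs))  # [[1., 2.], [1., 3.]]

    # To refuse silently dropping rows instead:
    #   dmatrices("y ~ x", data, NA_action="raise")  # raises PatsyError
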
@@ -667,31 +916,38 @@ def test_dmatrix_NA_action(): if return_type == "dataframe": assert lmat.index.equals(pandas.Index([1, 2, 3])) assert rmat.index.equals(pandas.Index([1, 2, 3])) - pytest.raises(PatsyError, - dmatrices, "y ~ 1", data=data, return_type=return_type, - NA_action="raise") + pytest.raises( + PatsyError, + dmatrices, + "y ~ 1", + data=data, + return_type=return_type, + NA_action="raise", + ) + def test_0d_data(): # Use case from statsmodels/statsmodels#1881 data_0d = {"x1": 1.1, "x2": 1.2, "a": "a1"} for formula, expected in [ - ("x1 + x2", [[1, 1.1, 1.2]]), - ("C(a, levels=('a1', 'a2')) + x1", [[1, 0, 1.1]]), - ]: + ("x1 + x2", [[1, 1.1, 1.2]]), + ("C(a, levels=('a1', 'a2')) + x1", [[1, 0, 1.1]]), + ]: mat = dmatrix(formula, data_0d) assert np.allclose(mat, expected) - assert np.allclose(build_design_matrices([mat.design_info], - data_0d)[0], - expected) + assert np.allclose( + build_design_matrices([mat.design_info], data_0d)[0], expected + ) if have_pandas: data_series = pandas.Series(data_0d) assert np.allclose(dmatrix(formula, data_series), expected) - assert np.allclose(build_design_matrices([mat.design_info], - data_series)[0], - expected) + assert np.allclose( + build_design_matrices([mat.design_info], data_series)[0], expected + ) + def test_env_not_saved_in_builder(): x_in_env = [1, 2, 3] @@ -702,6 +958,7 @@ def test_env_not_saved_in_builder(): assert np.allclose(design_matrix, design_matrix2) + def test_C_and_pandas_categorical(): if not have_pandas_categorical: return @@ -711,22 +968,14 @@ def test_C_and_pandas_categorical(): objs.append(pandas.Series(objs[0])) for obj in objs: d = {"obj": obj} - assert np.allclose(dmatrix("obj", d), - [[1, 1], - [1, 0], - [1, 1]]) - - assert np.allclose(dmatrix("C(obj)", d), - [[1, 1], - [1, 0], - [1, 1]]) - - assert np.allclose(dmatrix("C(obj, levels=['b', 'a'])", d), - [[1, 1], - [1, 0], - [1, 1]]) - - assert np.allclose(dmatrix("C(obj, levels=['a', 'b'])", d), - [[1, 0], - [1, 1], - [1, 0]]) + assert np.allclose(dmatrix("obj", d), [[1, 1], [1, 0], [1, 1]]) + + assert np.allclose(dmatrix("C(obj)", d), [[1, 1], [1, 0], [1, 1]]) + + assert np.allclose( + dmatrix("C(obj, levels=['b', 'a'])", d), [[1, 1], [1, 0], [1, 1]] + ) + + assert np.allclose( + dmatrix("C(obj, levels=['a', 'b'])", d), [[1, 0], [1, 1], [1, 0]] + ) diff --git a/patsy/test_regressions.py b/patsy/test_regressions.py index 8ab2d6d..2760846 100644 --- a/patsy/test_regressions.py +++ b/patsy/test_regressions.py @@ -5,16 +5,16 @@ # Regression tests for fixed bugs (when not otherwise better covered somewhere # else) -from patsy import (EvalEnvironment, dmatrix, build_design_matrices, - PatsyError, Origin) +from patsy import EvalEnvironment, dmatrix, build_design_matrices, PatsyError, Origin + def test_issue_11(): # Give a sensible error message for level mismatches # (At some points we've failed to put an origin= on these errors) env = EvalEnvironment.capture() - data = {"X" : [0,1,2,3], "Y" : [1,2,3,4]} + data = {"X": [0, 1, 2, 3], "Y": [1, 2, 3, 4]} formula = "C(X) + Y" - new_data = {"X" : [0,0,1,2,3,3,4], "Y" : [1,2,3,4,5,6,7]} + new_data = {"X": [0, 0, 1, 2, 3, 3, 4], "Y": [1, 2, 3, 4, 5, 6, 7]} info = dmatrix(formula, data) try: build_design_matrices([info.design_info], new_data) diff --git a/patsy/test_splines_bs_data.py b/patsy/test_splines_bs_data.py index ba02495..6b233fc 100644 --- a/patsy/test_splines_bs_data.py +++ b/patsy/test_splines_bs_data.py @@ -1,7 +1,31 @@ # This file auto-generated by tools/get-R-bs-test-vectors.R # Using: R version 2.15.1 
(2012-06-22) import numpy as np -R_bs_test_x = np.array([1, 1.5, 2.25, 3.375, 5.0625, 7.59375, 11.390625, 17.0859375, 25.62890625, 38.443359375, 57.6650390625, 86.49755859375, 129.746337890625, 194.6195068359375, 291.92926025390625, 437.893890380859375, 656.8408355712890625, 985.26125335693359375, 1477.8918800354003906, 2216.8378200531005859, ]) + +R_bs_test_x = np.array( + [ + 1, + 1.5, + 2.25, + 3.375, + 5.0625, + 7.59375, + 11.390625, + 17.0859375, + 25.62890625, + 38.443359375, + 57.6650390625, + 86.49755859375, + 129.746337890625, + 194.6195068359375, + 291.92926025390625, + 437.893890380859375, + 656.8408355712890625, + 985.26125335693359375, + 1477.8918800354003906, + 2216.8378200531005859, + ] +) R_bs_test_data = """ --BEGIN TEST CASE-- degree=1 diff --git a/patsy/test_splines_crs_data.py b/patsy/test_splines_crs_data.py index a0dcaa2..3b85f71 100644 --- a/patsy/test_splines_crs_data.py +++ b/patsy/test_splines_crs_data.py @@ -1,7 +1,31 @@ # This file auto-generated by tools/get-R-crs-test-vectors.R # Using: R version 3.0.3 (2014-03-06) and package 'mgcv' version 1.7.28 import numpy as np -R_crs_test_x = np.array([1, -1.5, 2.25, -3.375, 5.0625, -7.59375, 11.390625, -17.0859375, 25.628906250000000000, -38.443359375000000000, 57.665039062500000000, -86.497558593750000000, 129.74633789062500000, -194.6195068359375, 291.92926025390625000, -437.89389038085937500, 656.84083557128906250, -985.26125335693359375, 1477.8918800354003906, -2216.8378200531005859, ]) + +R_crs_test_x = np.array( + [ + 1, + -1.5, + 2.25, + -3.375, + 5.0625, + -7.59375, + 11.390625, + -17.0859375, + 25.628906250000000000, + -38.443359375000000000, + 57.665039062500000000, + -86.497558593750000000, + 129.74633789062500000, + -194.6195068359375, + 291.92926025390625000, + -437.89389038085937500, + 656.84083557128906250, + -985.26125335693359375, + 1477.8918800354003906, + -2216.8378200531005859, + ] +) R_crs_test_data = """ --BEGIN TEST CASE-- spline_type=cr diff --git a/patsy/test_state.py b/patsy/test_state.py index 3c04611..2c5a8e8 100644 --- a/patsy/test_state.py +++ b/patsy/test_state.py @@ -7,6 +7,7 @@ from patsy.state import Center, Standardize, center from patsy.util import atleast_2d_column_default + def check_stateful(cls, accepts_multicolumn, input, output, *args, **kwargs): input = np.asarray(input) output = np.asarray(output) @@ -27,19 +28,25 @@ def check_stateful(cls, accepts_multicolumn, input, output, *args, **kwargs): ([np.array(input)[:, None]], atleast_2d_column_default(output)), # 2-d but 1 column input, many chunks: ([np.array([[n]]) for n in input], atleast_2d_column_default(output)), - ] + ] if accepts_multicolumn: # 2-d array input, one chunk: test_cases += [ - ([np.column_stack((input, input[::-1]))], - np.column_stack((output, output[::-1]))), + ( + [np.column_stack((input, input[::-1]))], + np.column_stack((output, output[::-1])), + ), # 2-d array input, many chunks: - ([np.array([[input[i], input[-i-1]]]) for i in range(len(input))], - np.column_stack((output, output[::-1]))), - ] + ( + [np.array([[input[i], input[-i - 1]]]) for i in range(len(input))], + np.column_stack((output, output[::-1])), + ), + ] from patsy.util import have_pandas + if have_pandas: import pandas + pandas_type = (pandas.Series, pandas.DataFrame) pandas_index = np.linspace(0, 1, num=len(input)) # 1d and 2d here refer to the dimensionality of the input @@ -51,24 +58,32 @@ def check_stateful(cls, accepts_multicolumn, input, output, *args, **kwargs): # Series input, one chunk ([pandas.Series(input, index=pandas_index)], 
output_1d), # Series input, many chunks - ([pandas.Series([x], index=[idx]) - for (x, idx) in zip(input, pandas_index)], - output_1d), - ] + ( + [ + pandas.Series([x], index=[idx]) + for (x, idx) in zip(input, pandas_index) + ], + output_1d, + ), + ] if accepts_multicolumn: input_2d_2col = np.column_stack((input, input[::-1])) output_2d_2col = np.column_stack((output, output[::-1])) - output_2col_dataframe = pandas.DataFrame(output_2d_2col, - index=pandas_index) + output_2col_dataframe = pandas.DataFrame(output_2d_2col, index=pandas_index) test_cases += [ # DataFrame input, one chunk - ([pandas.DataFrame(input_2d_2col, index=pandas_index)], - output_2col_dataframe), + ( + [pandas.DataFrame(input_2d_2col, index=pandas_index)], + output_2col_dataframe, + ), # DataFrame input, many chunks - ([pandas.DataFrame([input_2d_2col[i, :]], - index=[pandas_index[i]]) - for i in range(len(input))], - output_2col_dataframe), + ( + [ + pandas.DataFrame([input_2d_2col[i, :]], index=[pandas_index[i]]) + for i in range(len(input)) + ], + output_2col_dataframe, + ), ] for input_obj, output_obj in test_cases: print(input_obj) @@ -113,28 +128,31 @@ def check_stateful(cls, accepts_multicolumn, input, output, *args, **kwargs): assert all_output2.ndim == all_input.ndim assert np.allclose(all_output2, output_obj) + def test_Center(): check_stateful(Center, True, [1, 2, 3], [-1, 0, 1]) check_stateful(Center, True, [1, 2, 1, 2], [-0.5, 0.5, -0.5, 0.5]) - check_stateful(Center, True, - [1.3, -10.1, 7.0, 12.0], - [-1.25, -12.65, 4.45, 9.45]) + check_stateful(Center, True, [1.3, -10.1, 7.0, 12.0], [-1.25, -12.65, 4.45, 9.45]) + def test_stateful_transform_wrapper(): assert np.allclose(center([1, 2, 3]), [-1, 0, 1]) assert np.allclose(center([1, 2, 1, 2]), [-0.5, 0.5, -0.5, 0.5]) assert center([1.0, 2.0, 3.0]).dtype == np.dtype(float) - assert (center(np.array([1.0, 2.0, 3.0], dtype=np.float32)).dtype - == np.dtype(np.float32)) + assert center(np.array([1.0, 2.0, 3.0], dtype=np.float32)).dtype == np.dtype( + np.float32 + ) assert center([1, 2, 3]).dtype == np.dtype(float) from patsy.util import have_pandas + if have_pandas: import pandas + s = pandas.Series([1, 2, 3], index=["a", "b", "c"]) - df = pandas.DataFrame([[1, 2], [2, 4], [3, 6]], - columns=["x1", "x2"], - index=[10, 20, 30]) + df = pandas.DataFrame( + [[1, 2], [2, 4], [3, 6]], columns=["x1", "x2"], index=[10, 20, 30] + ) s_c = center(s) assert isinstance(s_c, pandas.Series) assert np.array_equal(s_c.index, ["a", "b", "c"]) @@ -145,16 +163,17 @@ def test_stateful_transform_wrapper(): assert np.array_equal(df_c.columns, ["x1", "x2"]) assert np.allclose(df_c, [[-1, -2], [0, 0], [1, 2]]) + def test_Standardize(): check_stateful(Standardize, True, [1, -1], [1, -1]) check_stateful(Standardize, True, [12, 10], [1, -1]) - check_stateful(Standardize, True, - [12, 11, 10], - [np.sqrt(3./2), 0, -np.sqrt(3./2)]) + check_stateful( + Standardize, True, [12, 11, 10], [np.sqrt(3.0 / 2), 0, -np.sqrt(3.0 / 2)] + ) - check_stateful(Standardize, True, - [12.0, 11.0, 10.0], - [np.sqrt(3./2), 0, -np.sqrt(3./2)]) + check_stateful( + Standardize, True, [12.0, 11.0, 10.0], [np.sqrt(3.0 / 2), 0, -np.sqrt(3.0 / 2)] + ) # XX: see the comment in Standardize.transform about why this doesn't # work: @@ -164,26 +183,25 @@ def test_Standardize(): r20 = list(range(20)) - check_stateful(Standardize, True, [1, -1], [np.sqrt(2)/2, -np.sqrt(2)/2], - ddof=1) - - check_stateful(Standardize, True, - r20, - list((np.arange(20) - 9.5) / 5.7662812973353983), - ddof=0) - check_stateful(Standardize, True, 
- r20, - list((np.arange(20) - 9.5) / 5.9160797830996161), - ddof=1) - check_stateful(Standardize, True, - r20, - list((np.arange(20) - 9.5)), - rescale=False, ddof=1) - check_stateful(Standardize, True, - r20, - list(np.arange(20) / 5.9160797830996161), - center=False, ddof=1) - check_stateful(Standardize, True, - r20, - r20, - center=False, rescale=False, ddof=1) + check_stateful( + Standardize, True, [1, -1], [np.sqrt(2) / 2, -np.sqrt(2) / 2], ddof=1 + ) + + check_stateful( + Standardize, True, r20, list((np.arange(20) - 9.5) / 5.7662812973353983), ddof=0 + ) + check_stateful( + Standardize, True, r20, list((np.arange(20) - 9.5) / 5.9160797830996161), ddof=1 + ) + check_stateful( + Standardize, True, r20, list((np.arange(20) - 9.5)), rescale=False, ddof=1 + ) + check_stateful( + Standardize, + True, + r20, + list(np.arange(20) / 5.9160797830996161), + center=False, + ddof=1, + ) + check_stateful(Standardize, True, r20, r20, center=False, rescale=False, ddof=1) diff --git a/patsy/tokens.py b/patsy/tokens.py index 542d464..9cc500c 100644 --- a/patsy/tokens.py +++ b/patsy/tokens.py @@ -17,8 +17,8 @@ from patsy import PatsyError from patsy.origin import Origin -__all__ = ["python_tokenize", "pretty_untokenize", - "normalize_token_spacing"] +__all__ = ["python_tokenize", "pretty_untokenize", "normalize_token_spacing"] + # A convenience wrapper around tokenize.generate_tokens. yields tuples # (tokenize type, token string, origin object) @@ -29,7 +29,7 @@ def python_tokenize(code): code = code.replace("\n", " ").strip() it = tokenize.generate_tokens(StringIO(code).readline) try: - for (pytype, string, (_, start), (_, end), code) in it: + for pytype, string, (_, start), (_, end), code in it: if pytype == tokenize.ENDMARKER: break if pytype in (tokenize.NL, tokenize.NEWLINE): @@ -37,13 +37,13 @@ def python_tokenize(code): continue origin = Origin(code, start, end) if pytype == tokenize.ERRORTOKEN: - raise PatsyError("error tokenizing input " - "(maybe an unclosed string?)", - origin) + raise PatsyError( + "error tokenizing input " "(maybe an unclosed string?)", origin + ) if pytype == tokenize.COMMENT: raise PatsyError("comments are not allowed", origin) yield (pytype, string, origin) - else: # pragma: no cover + else: # pragma: no cover raise ValueError("stream ended without ENDMARKER?!?") except tokenize.TokenError as e: # TokenError is raised iff the tokenizer thinks that there is @@ -63,40 +63,55 @@ def python_tokenize(code): assert "EOF in multi-line" in e.args[0] return + def test_python_tokenize(): code = "a + (foo * -1)" tokens = list(python_tokenize(code)) - expected = [(tokenize.NAME, "a", Origin(code, 0, 1)), - (tokenize.OP, "+", Origin(code, 2, 3)), - (tokenize.OP, "(", Origin(code, 4, 5)), - (tokenize.NAME, "foo", Origin(code, 5, 8)), - (tokenize.OP, "*", Origin(code, 9, 10)), - (tokenize.OP, "-", Origin(code, 11, 12)), - (tokenize.NUMBER, "1", Origin(code, 12, 13)), - (tokenize.OP, ")", Origin(code, 13, 14))] + expected = [ + (tokenize.NAME, "a", Origin(code, 0, 1)), + (tokenize.OP, "+", Origin(code, 2, 3)), + (tokenize.OP, "(", Origin(code, 4, 5)), + (tokenize.NAME, "foo", Origin(code, 5, 8)), + (tokenize.OP, "*", Origin(code, 9, 10)), + (tokenize.OP, "-", Origin(code, 11, 12)), + (tokenize.NUMBER, "1", Origin(code, 12, 13)), + (tokenize.OP, ")", Origin(code, 13, 14)), + ] assert tokens == expected code2 = "a + (b" tokens2 = list(python_tokenize(code2)) - expected2 = [(tokenize.NAME, "a", Origin(code2, 0, 1)), - (tokenize.OP, "+", Origin(code2, 2, 3)), - (tokenize.OP, "(", 
Origin(code2, 4, 5)), - (tokenize.NAME, "b", Origin(code2, 5, 6))] + expected2 = [ + (tokenize.NAME, "a", Origin(code2, 0, 1)), + (tokenize.OP, "+", Origin(code2, 2, 3)), + (tokenize.OP, "(", Origin(code2, 4, 5)), + (tokenize.NAME, "b", Origin(code2, 5, 6)), + ] assert tokens2 == expected2 import pytest + pytest.raises(PatsyError, list, python_tokenize("a b # c")) import pytest - pytest.raises(PatsyError, list, python_tokenize("a b \"c")) -_python_space_both = (list("+-*/%&^|<>") - + ["==", "<>", "!=", "<=", ">=", - "<<", ">>", "**", "//"]) -_python_space_before = (_python_space_both - + ["!", "~"]) -_python_space_after = (_python_space_both - + [",", ":"]) + pytest.raises(PatsyError, list, python_tokenize('a b "c')) + + +_python_space_both = list("+-*/%&^|<>") + [ + "==", + "<>", + "!=", + "<=", + ">=", + "<<", + ">>", + "**", + "//", +] +_python_space_before = _python_space_both + ["!", "~"] +_python_space_after = _python_space_both + [",", ":"] + def pretty_untokenize(typed_tokens): text = [] @@ -106,8 +121,7 @@ def pretty_untokenize(typed_tokens): prev_was_object_like = False brackets = [] for token_type, token in typed_tokens: - assert token_type not in (tokenize.INDENT, tokenize.DEDENT, - tokenize.NL) + assert token_type not in (tokenize.INDENT, tokenize.DEDENT, tokenize.NL) if token_type == tokenize.NEWLINE: continue if token_type == tokenize.ENDMARKER: @@ -123,8 +137,8 @@ def pretty_untokenize(typed_tokens): brackets.append(token) elif brackets and token in (")", "]", "}"): brackets.pop() - this_wants_space_before = (token in _python_space_before) - this_wants_space_after = (token in _python_space_after) + this_wants_space_before = token in _python_space_before + this_wants_space_after = token in _python_space_after # Special case for slice syntax: foo[:10] # Otherwise ":" is spaced after, like: "{1: ...}", "if a: ..." 
if token == ":" and brackets and brackets[-1] == "[": @@ -149,19 +163,22 @@ def pretty_untokenize(typed_tokens): text.append(token) prev_wants_space = this_wants_space_after prev_was_space_delim = False - if (token_type in (tokenize.NAME, tokenize.NUMBER, tokenize.STRING) - or token == ")"): + if ( + token_type in (tokenize.NAME, tokenize.NUMBER, tokenize.STRING) + or token == ")" + ): prev_was_object_like = True else: prev_was_object_like = False prev_was_open_paren_or_comma = token in ("(", ",") return "".join(text) + def normalize_token_spacing(code): - tokens = [(t[0], t[1]) - for t in tokenize.generate_tokens(StringIO(code).readline)] + tokens = [(t[0], t[1]) for t in tokenize.generate_tokens(StringIO(code).readline)] return pretty_untokenize(tokens) + def test_pretty_untokenize_and_normalize_token_spacing(): assert normalize_token_spacing("1 + 1") == "1 + 1" assert normalize_token_spacing("1+1") == "1 + 1" diff --git a/patsy/user_util.py b/patsy/user_util.py index c40c946..080af84 100644 --- a/patsy/user_util.py +++ b/patsy/user_util.py @@ -14,6 +14,7 @@ from patsy.categorical import C from patsy.util import no_pickling, assert_no_pickling + def balanced(**kwargs): """balanced(factor_name=num_levels, [factor_name=num_levels, ..., repeat=1]) @@ -54,15 +55,41 @@ def balanced(**kwargs): data[name] = list(value) * repeat return data + def test_balanced(): data = balanced(a=2, b=3) assert data["a"] == ["a1", "a1", "a1", "a2", "a2", "a2"] assert data["b"] == ["b1", "b2", "b3", "b1", "b2", "b3"] data = balanced(a=2, b=3, repeat=2) - assert data["a"] == ["a1", "a1", "a1", "a2", "a2", "a2", - "a1", "a1", "a1", "a2", "a2", "a2"] - assert data["b"] == ["b1", "b2", "b3", "b1", "b2", "b3", - "b1", "b2", "b3", "b1", "b2", "b3"] + assert data["a"] == [ + "a1", + "a1", + "a1", + "a2", + "a2", + "a2", + "a1", + "a1", + "a1", + "a2", + "a2", + "a2", + ] + assert data["b"] == [ + "b1", + "b2", + "b3", + "b1", + "b2", + "b3", + "b1", + "b2", + "b3", + "b1", + "b2", + "b3", + ] + def demo_data(*names, **kwargs): """demo_data(*names, nlevels=2, min_rows=5) @@ -119,6 +146,7 @@ def demo_data(*names, **kwargs): data[name] = r.normal(size=num_rows) return data + def test_demo_data(): d1 = demo_data("a", "b", "x") assert sorted(d1.keys()) == ["a", "b", "x"] @@ -136,9 +164,11 @@ def test_demo_data(): assert len(demo_data("a", "b", "x", min_rows=10, nlevels=3)["x"]) == 18 import pytest + pytest.raises(PatsyError, demo_data, "a", "b", "__123") pytest.raises(TypeError, demo_data, "a", "b", asdfasdf=123) + class LookupFactor(object): """A simple factor class that simply looks up a named entry in the given data. @@ -166,9 +196,10 @@ class LookupFactor(object): .. versionadded:: 0.2.0 The ``force_categorical`` and related arguments. 
""" - def __init__(self, varname, - force_categorical=False, contrast=None, levels=None, - origin=None): + + def __init__( + self, varname, force_categorical=False, contrast=None, levels=None, origin=None + ): self._varname = varname self._force_categorical = force_categorical self._contrast = contrast @@ -187,26 +218,35 @@ def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self._varname) def __eq__(self, other): - return (isinstance(other, LookupFactor) - and self._varname == other._varname - and self._force_categorical == other._force_categorical - and self._contrast == other._contrast - and self._levels == other._levels) + return ( + isinstance(other, LookupFactor) + and self._varname == other._varname + and self._force_categorical == other._force_categorical + and self._contrast == other._contrast + and self._levels == other._levels + ) def __ne__(self, other): return not self == other def __hash__(self): - return hash((LookupFactor, self._varname, - self._force_categorical, self._contrast, self._levels)) + return hash( + ( + LookupFactor, + self._varname, + self._force_categorical, + self._contrast, + self._levels, + ) + ) def memorize_passes_needed(self, state, eval_env): return 0 - def memorize_chunk(self, state, which_pass, data): # pragma: no cover + def memorize_chunk(self, state, which_pass, data): # pragma: no cover assert False - def memorize_finish(self, state, which_pass): # pragma: no cover + def memorize_finish(self, state, which_pass): # pragma: no cover assert False def eval(self, memorize_state, data): @@ -217,6 +257,7 @@ def eval(self, memorize_state, data): __getstate__ = no_pickling + def test_LookupFactor(): l_a = LookupFactor("a") assert l_a.name() == "a" @@ -231,14 +272,14 @@ def test_LookupFactor(): l_with_origin = LookupFactor("b", origin="asdf") assert l_with_origin.origin == "asdf" - l_c = LookupFactor("c", force_categorical=True, - contrast="CONTRAST", levels=(1, 2)) + l_c = LookupFactor("c", force_categorical=True, contrast="CONTRAST", levels=(1, 2)) box = l_c.eval({}, {"c": [1, 1, 2]}) assert box.data == [1, 1, 2] assert box.contrast == "CONTRAST" assert box.levels == (1, 2) import pytest + pytest.raises(ValueError, LookupFactor, "nc", contrast="CONTRAST") pytest.raises(ValueError, LookupFactor, "nc", levels=(1, 2)) diff --git a/patsy/util.py b/patsy/util.py index 8a855ef..2c1c19d 100644 --- a/patsy/util.py +++ b/patsy/util.py @@ -4,24 +4,32 @@ # Some generic utilities. 
-__all__ = ["atleast_2d_column_default", "uniqueify_list", - "widest_float", "widest_complex", "wide_dtype_for", "widen", - "repr_pretty_delegate", "repr_pretty_impl", - "SortAnythingKey", "safe_scalar_isnan", "safe_isnan", - "iterable", - "have_pandas", - "have_pandas_categorical", - "have_pandas_categorical_dtype", - "pandas_Categorical_from_codes", - "pandas_Categorical_categories", - "pandas_Categorical_codes", - "safe_is_pandas_categorical_dtype", - "safe_is_pandas_categorical", - "safe_issubdtype", - "no_pickling", - "assert_no_pickling", - "safe_string_eq", - ] +__all__ = [ + "atleast_2d_column_default", + "uniqueify_list", + "widest_float", + "widest_complex", + "wide_dtype_for", + "widen", + "repr_pretty_delegate", + "repr_pretty_impl", + "SortAnythingKey", + "safe_scalar_isnan", + "safe_isnan", + "iterable", + "have_pandas", + "have_pandas_categorical", + "have_pandas_categorical_dtype", + "pandas_Categorical_from_codes", + "pandas_Categorical_categories", + "pandas_Categorical_codes", + "safe_is_pandas_categorical_dtype", + "safe_is_pandas_categorical", + "safe_issubdtype", + "no_pickling", + "assert_no_pickling", + "safe_string_eq", +] import sys from io import StringIO @@ -39,17 +47,22 @@ # Pandas versions < 0.9.0 don't have Categorical # Can drop this guard whenever we drop support for such older versions of # pandas. -have_pandas_categorical = (have_pandas and hasattr(pandas, "Categorical")) +have_pandas_categorical = have_pandas and hasattr(pandas, "Categorical") if not have_pandas: _pandas_is_categorical_dtype = None else: if hasattr(pandas, "CategoricalDtype"): # pandas >= 0.25 - _pandas_is_categorical_dtype = lambda x: isinstance(getattr(x, "dtype", x), pandas.CategoricalDtype) + _pandas_is_categorical_dtype = lambda x: isinstance( + getattr(x, "dtype", x), pandas.CategoricalDtype + ) elif hasattr(pandas, "api"): # pandas >= 0.19 - _pandas_is_categorical_dtype = getattr(pandas.api.types, "is_categorical_dtype", None) + _pandas_is_categorical_dtype = getattr( + pandas.api.types, "is_categorical_dtype", None + ) else: # pandas <=0.18 - _pandas_is_categorical_dtype = getattr(pandas.core.common, - "is_categorical_dtype", None) + _pandas_is_categorical_dtype = getattr( + pandas.core.common, "is_categorical_dtype", None + ) have_pandas_categorical_dtype = _pandas_is_categorical_dtype is not None # The handling of the `copy` keyword has been changed since numpy>=2. 
@@ -78,13 +91,14 @@ def asarray_or_pandas(a, copy=copy_if_needed, dtype=None, subok=False): def test_asarray_or_pandas(): import warnings + assert type(asarray_or_pandas([1, 2, 3])) is np.ndarray with warnings.catch_warnings() as w: - warnings.filterwarnings('ignore', 'the matrix subclass', - PendingDeprecationWarning) + warnings.filterwarnings( + "ignore", "the matrix subclass", PendingDeprecationWarning + ) assert type(asarray_or_pandas(np.matrix([[1, 2, 3]]))) is np.ndarray - assert type(asarray_or_pandas( - np.matrix([[1, 2, 3]]), subok=True)) is np.matrix + assert type(asarray_or_pandas(np.matrix([[1, 2, 3]]), subok=True)) is np.matrix assert w is None a = np.array([1, 2, 3]) assert asarray_or_pandas(a) is a @@ -92,8 +106,7 @@ def test_asarray_or_pandas(): assert np.array_equal(a, a_copy) a_copy[0] = 100 assert not np.array_equal(a, a_copy) - assert np.allclose(asarray_or_pandas([1, 2, 3], dtype=float), - [1.0, 2.0, 3.0]) + assert np.allclose(asarray_or_pandas([1, 2, 3], dtype=float), [1.0, 2.0, 3.0]) assert asarray_or_pandas([1, 2, 3], dtype=float).dtype == np.dtype(float) a_view = asarray_or_pandas(a, dtype=a.dtype) a_view[0] = 99 @@ -119,9 +132,7 @@ def test_asarray_or_pandas(): s_view2[10] = 99 assert s[10] == 99 - df = pandas.DataFrame([[1, 2, 3]], - columns=["A", "B", "C"], - index=[10]) + df = pandas.DataFrame([[1, 2, 3]], columns=["A", "B", "C"], index=[10]) df_view1 = asarray_or_pandas(df) df_view1.loc[10, "A"] = 101 assert np.array_equal(df_view1.columns, ["A", "B", "C"]) @@ -150,13 +161,12 @@ def test_asarray_or_pandas(): had_pandas = have_pandas try: have_pandas = False - assert (type(asarray_or_pandas(pandas.Series([1, 2, 3]))) - is np.ndarray) - assert (type(asarray_or_pandas(pandas.DataFrame([[1, 2, 3]]))) - is np.ndarray) + assert type(asarray_or_pandas(pandas.Series([1, 2, 3]))) is np.ndarray + assert type(asarray_or_pandas(pandas.DataFrame([[1, 2, 3]]))) is np.ndarray finally: have_pandas = had_pandas + # Like np.atleast_2d, but this converts lower-dimensional arrays into columns, # instead of rows. It also converts ndarray subclasses into basic ndarrays, # which makes it easier to guarantee correctness. 
However, there are many @@ -179,6 +189,7 @@ def atleast_2d_column_default(a, preserve_pandas=False): def test_atleast_2d_column_default(): import warnings + assert np.all(atleast_2d_column_default([1, 2, 3]) == [[1], [2], [3]]) assert atleast_2d_column_default(1).shape == (1, 1) @@ -190,51 +201,72 @@ def test_atleast_2d_column_default(): assert atleast_2d_column_default([[1], [2], [3]]).shape == (3, 1) with warnings.catch_warnings() as w: - warnings.filterwarnings('ignore', 'the matrix subclass', - PendingDeprecationWarning) + warnings.filterwarnings( + "ignore", "the matrix subclass", PendingDeprecationWarning + ) assert type(atleast_2d_column_default(np.matrix(1))) == np.ndarray assert w is None global have_pandas if have_pandas: - assert (type(atleast_2d_column_default(pandas.Series([1, 2]))) - == np.ndarray) - assert (type(atleast_2d_column_default(pandas.DataFrame([[1], [2]]))) - == np.ndarray) - assert (type(atleast_2d_column_default(pandas.Series([1, 2]), - preserve_pandas=True)) - == pandas.DataFrame) - assert (type(atleast_2d_column_default(pandas.DataFrame([[1], [2]]), - preserve_pandas=True)) - == pandas.DataFrame) + assert type(atleast_2d_column_default(pandas.Series([1, 2]))) == np.ndarray + assert ( + type(atleast_2d_column_default(pandas.DataFrame([[1], [2]]))) == np.ndarray + ) + assert ( + type(atleast_2d_column_default(pandas.Series([1, 2]), preserve_pandas=True)) + == pandas.DataFrame + ) + assert ( + type( + atleast_2d_column_default( + pandas.DataFrame([[1], [2]]), preserve_pandas=True + ) + ) + == pandas.DataFrame + ) s = pandas.Series([10, 11, 12], name="hi", index=["a", "b", "c"]) df = atleast_2d_column_default(s, preserve_pandas=True) assert isinstance(df, pandas.DataFrame) assert np.all(df.columns == ["hi"]) assert np.all(df.index == ["a", "b", "c"]) with warnings.catch_warnings() as w: - warnings.filterwarnings('ignore', 'the matrix subclass', - PendingDeprecationWarning) - assert (type(atleast_2d_column_default(np.matrix(1), - preserve_pandas=True)) - == np.ndarray) + warnings.filterwarnings( + "ignore", "the matrix subclass", PendingDeprecationWarning + ) + assert ( + type(atleast_2d_column_default(np.matrix(1), preserve_pandas=True)) + == np.ndarray + ) assert w is None - assert (type(atleast_2d_column_default([1, 2, 3], preserve_pandas=True)) - == np.ndarray) + assert ( + type(atleast_2d_column_default([1, 2, 3], preserve_pandas=True)) == np.ndarray + ) if have_pandas: had_pandas = have_pandas try: have_pandas = False - assert (type(atleast_2d_column_default(pandas.Series([1, 2]), - preserve_pandas=True)) - == np.ndarray) - assert (type(atleast_2d_column_default(pandas.DataFrame([[1], [2]]), - preserve_pandas=True)) - == np.ndarray) + assert ( + type( + atleast_2d_column_default( + pandas.Series([1, 2]), preserve_pandas=True + ) + ) + == np.ndarray + ) + assert ( + type( + atleast_2d_column_default( + pandas.DataFrame([[1], [2]]), preserve_pandas=True + ) + ) + == np.ndarray + ) finally: have_pandas = had_pandas + # A version of .reshape() that knows how to down-convert a 1-column # pandas.DataFrame into a pandas.Series. 
Useful for code that wants to be # agnostic between 1d and 2d data, with the pattern: @@ -254,15 +286,19 @@ def pandas_friendly_reshape(a, new_shape): if new_shape[0] != a.shape[0]: raise ValueError("arrays have incompatible sizes") return a[a.columns[0]] - raise ValueError("cannot reshape a DataFrame with shape %s to shape %s" - % (a.shape, new_shape)) + raise ValueError( + "cannot reshape a DataFrame with shape %s to shape %s" % (a.shape, new_shape) + ) + def test_pandas_friendly_reshape(): import pytest + global have_pandas - assert np.allclose(pandas_friendly_reshape(np.arange(10).reshape(5, 2), - (2, 5)), - np.arange(10).reshape(2, 5)) + assert np.allclose( + pandas_friendly_reshape(np.arange(10).reshape(5, 2), (2, 5)), + np.arange(10).reshape(2, 5), + ) if have_pandas: df = pandas.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"]) noop = pandas_friendly_reshape(df, (3, 1)) @@ -287,6 +323,7 @@ def test_pandas_friendly_reshape(): finally: have_pandas = had_pandas + def uniqueify_list(seq): seq_new = [] seen = set() @@ -296,46 +333,54 @@ def uniqueify_list(seq): seen.add(obj) return seq_new + def test_to_uniqueify_list(): assert uniqueify_list([1, 2, 3]) == [1, 2, 3] assert uniqueify_list([1, 3, 3, 2, 3, 1]) == [1, 3, 2] assert uniqueify_list([3, 2, 1, 4, 1, 2, 3]) == [3, 2, 1, 4] + for float_type in ("float128", "float96", "float64"): if hasattr(np, float_type): widest_float = getattr(np, float_type) break -else: # pragma: no cover +else: # pragma: no cover assert False for complex_type in ("complex256", "complex196", "complex128"): if hasattr(np, complex_type): widest_complex = getattr(np, complex_type) break -else: # pragma: no cover +else: # pragma: no cover assert False + def wide_dtype_for(arr): arr = np.asarray(arr) - if (safe_issubdtype(arr.dtype, np.integer) - or safe_issubdtype(arr.dtype, np.floating)): + if safe_issubdtype(arr.dtype, np.integer) or safe_issubdtype( + arr.dtype, np.floating + ): return widest_float elif safe_issubdtype(arr.dtype, np.complexfloating): return widest_complex raise ValueError("cannot widen a non-numeric type %r" % (arr.dtype,)) + def widen(arr): return np.asarray(arr, dtype=wide_dtype_for(arr)) + def test_wide_dtype_for_and_widen(): assert np.allclose(widen([1, 2, 3]), [1, 2, 3]) assert widen([1, 2, 3]).dtype == widest_float assert np.allclose(widen([1.0, 2.0, 3.0]), [1, 2, 3]) assert widen([1.0, 2.0, 3.0]).dtype == widest_float - assert np.allclose(widen([1+0j, 2, 3]), [1, 2, 3]) - assert widen([1+0j, 2, 3]).dtype == widest_complex + assert np.allclose(widen([1 + 0j, 2, 3]), [1, 2, 3]) + assert widen([1 + 0j, 2, 3]).dtype == widest_complex import pytest + pytest.raises(ValueError, widen, ["hi"]) + class PushbackAdapter(object): def __init__(self, it): self._it = it @@ -353,6 +398,7 @@ def next(self): else: # May raise StopIteration return next(self._it) + __next__ = next def peek(self): @@ -371,6 +417,7 @@ def has_more(self): else: return True + def test_PushbackAdapter(): it = PushbackAdapter(iter([1, 2, 3, 4])) assert it.has_more() @@ -387,6 +434,7 @@ def test_PushbackAdapter(): assert list(it) == [20, 10, 3, 4] assert not it.has_more() + # The IPython pretty-printer gives very nice output that is difficult to get # otherwise, e.g., look how much more readable this is than if it were all # smooshed onto one line: @@ -407,6 +455,7 @@ def test_PushbackAdapter(): # Pretty printer docs: # http://ipython.org/ipython-doc/dev/api/generated/IPython.lib.pretty.html + class _MiniPPrinter(object): def __init__(self): self._out = StringIO() @@ -433,10 
+482,12 @@ def pretty(self, obj): def getvalue(self): return self._out.getvalue() + def _mini_pretty(obj): - printer = _MiniPPrinter() - printer.pretty(obj) - return printer.getvalue() + printer = _MiniPPrinter() + printer.pretty(obj) + return printer.getvalue() + def repr_pretty_delegate(obj): # If IPython is already loaded, then might as well use it. (Most commonly @@ -453,19 +504,23 @@ def repr_pretty_delegate(obj): # in their test suite (see patsy bug #12). if optional_dep_ok and "IPython" in sys.modules: from IPython.lib.pretty import pretty + return pretty(obj) else: return _mini_pretty(obj) + def repr_pretty_impl(p, obj, args, kwargs=[]): name = obj.__class__.__name__ p.begin_group(len(name) + 1, "%s(" % (name,)) started = [False] + def new_item(): if started[0]: p.text(",") p.breakable() started[0] = True + for arg in args: new_item() p.pretty(arg) @@ -476,15 +531,18 @@ def new_item(): p.end_group(len(label) + 1, "") p.end_group(len(name) + 1, ")") + def test_repr_pretty(): assert repr_pretty_delegate("asdf") == "'asdf'" printer = _MiniPPrinter() + class MyClass(object): pass - repr_pretty_impl(printer, MyClass(), - ["a", 1], [("foo", "bar"), ("asdf", "asdf")]) + + repr_pretty_impl(printer, MyClass(), ["a", 1], [("foo", "bar"), ("asdf", "asdf")]) assert printer.getvalue() == "MyClass('a', 1, foo='bar', asdf='asdf')" + # In Python 3, objects of different types are not generally comparable, so a # list of heterogeneous types cannot be sorted. This implements a Python 2 # style comparison for arbitrary types. (It works on Python 2 too, but just @@ -537,25 +595,38 @@ def __lt__(self, other): if self.obj == other.obj: return False # Otherwise, we break ties based on class name and memory position - return ((self.obj.__class__.__name__, id(self.obj)) - < (other.obj.__class__.__name__, id(other.obj))) + return (self.obj.__class__.__name__, id(self.obj)) < ( + other.obj.__class__.__name__, + id(other.obj), + ) + def test_SortAnythingKey(): assert sorted([20, 10, 0, 15], key=SortAnythingKey) == [0, 10, 15, 20] assert sorted([10, -1.5], key=SortAnythingKey) == [-1.5, 10] assert sorted([10, "a", 20.5, "b"], key=SortAnythingKey) == [10, 20.5, "a", "b"] + class a(object): pass + class b(object): pass + class z(object): pass + a_obj = a() b_obj = b() z_obj = z() o_obj = object() - assert (sorted([z_obj, a_obj, 1, b_obj, o_obj], key=SortAnythingKey) - == [1, a_obj, b_obj, o_obj, z_obj]) + assert sorted([z_obj, a_obj, 1, b_obj, o_obj], key=SortAnythingKey) == [ + 1, + a_obj, + b_obj, + o_obj, + z_obj, + ] + # NaN checking functions that work on arbitrary objects, on old Python # versions (math.isnan is only in 2.6+), etc. 
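For context on the hunk that follows: plain np.isnan blows up on non-numeric objects (np.isnan("asdf") is a TypeError, and the error even differs between strings and other objects), so patsy wraps it in safe_scalar_isnan plus a vectorized safe_isnan that simply answer False for anything that cannot be coerced to float. Usage, mirroring test_safe_isnan below:

    import numpy as np
    from patsy.util import safe_isnan

    # Mixed types are fine; only the genuine NaN reports True:
    print(safe_isnan([1, True, None, np.nan, "asdf"]))
    # -> [False False False  True False]
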
@@ -564,8 +635,11 @@ def safe_scalar_isnan(x): return np.isnan(float(x)) except (TypeError, ValueError, NotImplementedError): return False + + safe_isnan = np.vectorize(safe_scalar_isnan, otypes=[bool]) + def test_safe_scalar_isnan(): assert not safe_scalar_isnan(True) assert not safe_scalar_isnan(None) @@ -577,15 +651,18 @@ def test_safe_scalar_isnan(): assert safe_scalar_isnan(np.float32(np.nan)) assert safe_scalar_isnan(float(np.nan)) + def test_safe_isnan(): - assert np.array_equal(safe_isnan([1, True, None, np.nan, "asdf"]), - [False, False, False, True, False]) + assert np.array_equal( + safe_isnan([1, True, None, np.nan, "asdf"]), [False, False, False, True, False] + ) assert safe_isnan(np.nan).ndim == 0 assert safe_isnan(np.nan) assert not safe_isnan(None) # raw isnan raises a *different* error for strings than for objects: assert not safe_isnan("asdf") + def iterable(obj): try: iter(obj) @@ -593,6 +670,7 @@ def iterable(obj): return False return True + def test_iterable(): assert iterable("asdf") assert iterable([]) @@ -600,6 +678,7 @@ def test_iterable(): assert not iterable(1) assert not iterable(iterable) + ##### Handling Pandas's categorical stuff is horrible and hateful # Basically they decided that they didn't like how numpy does things, so their @@ -616,6 +695,7 @@ def test_iterable(): # Also there are hoops to jump through to handle both the old style # (Categorical objects) and new-style (Series with dtype="category"). + # Needed to support pandas < 0.15 def pandas_Categorical_from_codes(codes, categories): assert have_pandas_categorical @@ -628,6 +708,7 @@ def pandas_Categorical_from_codes(codes, categories): else: return pandas.Categorical(codes, categories) + def test_pandas_Categorical_from_codes(): if not have_pandas_categorical: return @@ -635,6 +716,7 @@ def test_pandas_Categorical_from_codes(): assert np.all(np.asarray(c)[:-1] == ["b", "b", "a"]) assert np.isnan(np.asarray(c)[-1]) + # Needed to support pandas < 0.15 def pandas_Categorical_categories(cat): # In 0.15+, a categorical Series has a .cat attribute which is similar to @@ -647,6 +729,7 @@ def pandas_Categorical_categories(cat): else: return cat.levels + # Needed to support pandas < 0.15 def pandas_Categorical_codes(cat): # In 0.15+, a categorical Series has a .cat attribute which is a @@ -659,6 +742,7 @@ def pandas_Categorical_codes(cat): else: return cat.labels + def test_pandas_Categorical_accessors(): if not have_pandas_categorical: return @@ -671,12 +755,14 @@ def test_pandas_Categorical_accessors(): assert np.all(pandas_Categorical_categories(s) == ["a", "b"]) assert np.all(pandas_Categorical_codes(s) == [1, 1, 0, -1]) + # Needed to support pandas >= 0.15 (!) def safe_is_pandas_categorical_dtype(dt): if not have_pandas_categorical_dtype: return False return _pandas_is_categorical_dtype(dt) + # Needed to support pandas >= 0.15 (!) def safe_is_pandas_categorical(data): if not have_pandas_categorical: @@ -687,6 +773,7 @@ def safe_is_pandas_categorical(data): return safe_is_pandas_categorical_dtype(data.dtype) return False + def test_safe_is_pandas_categorical(): assert not safe_is_pandas_categorical(np.arange(10)) @@ -698,6 +785,7 @@ def test_safe_is_pandas_categorical(): s_obj = pandas.Series(["a", "b"], dtype="category") assert safe_is_pandas_categorical(s_obj) + # Needed to support pandas >= 0.15 (!) 
# Calling np.issubdtype on a pandas categorical will blow up -- the officially # recommended solution is to replace every piece of code like @@ -715,6 +803,7 @@ def safe_issubdtype(dt1, dt2): return False return np.issubdtype(dt1, dt2) + def test_safe_issubdtype(): assert safe_issubdtype(int, np.integer) assert safe_issubdtype(np.dtype(float), np.floating) @@ -725,17 +814,22 @@ def test_safe_issubdtype(): bad_dtype = pandas.Series(["a", "b"], dtype="category") assert not safe_issubdtype(bad_dtype, np.integer) + def no_pickling(*args, **kwargs): raise NotImplementedError( "Sorry, pickling not yet supported. " "See https://github.com/pydata/patsy/issues/26 if you want to " - "help.") + "help." + ) + def assert_no_pickling(obj): import pickle import pytest + pytest.raises(NotImplementedError, pickle.dumps, obj) + # Use like: # if safe_string_eq(constraints, "center"): # ... @@ -747,6 +841,7 @@ def safe_string_eq(obj, value): else: return False + def test_safe_string_eq(): assert safe_string_eq("foo", "foo") assert not safe_string_eq("foo", "bar") diff --git a/setup.py b/setup.py index 89eb46d..e1b63c7 100644 --- a/setup.py +++ b/setup.py @@ -2,8 +2,10 @@ from setuptools import setup -DESC = ("A Python package for describing statistical models and for " - "building design matrices.") +DESC = ( + "A Python package for describing statistical models and for " + "building design matrices." +) LONG_DESC = open("README.md").read() @@ -12,7 +14,7 @@ setup( name="patsy", - version=__version__, + version=__version__, # noqa: F821 description=DESC, long_description=LONG_DESC, long_description_content_type="text/markdown", @@ -27,24 +29,24 @@ "numpy >= 1.4", ], extras_require={ - "test": ["pytest", "pytest-cov", "scipy"], + "test": ["pytest", "pytest-cov", "scipy"], }, - python_requires='>=3.6', + python_requires=">=3.6", classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Intended Audience :: Financial and Insurance Industry", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Topic :: Scientific/Engineering", + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Financial and Insurance Industry", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Scientific/Engineering", ], )