Skip to content

Commit

Permalink
Merge pull request #20 from st-tech/feat/update-version-to-0.3.2
Browse files Browse the repository at this point in the history
update version to 0.3.2
  • Loading branch information
usaito authored Nov 7, 2020
2 parents 1e1eb9f + f09f218 commit 0667738
Show file tree
Hide file tree
Showing 16 changed files with 177 additions and 171 deletions.
1 change: 1 addition & 0 deletions docs/obp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dataset module
obp.dataset.base
obp.dataset.real
obp.dataset.synthetic
obp.dataset.multiclass


simulator module
Expand Down
65 changes: 34 additions & 31 deletions examples/quickstart/quickstart.ipynb

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions examples/quickstart/quickstart_synthetic.ipynb

Large diffs are not rendered by default.

24 changes: 13 additions & 11 deletions obp/dataset/multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ class MultiClassToBanditReduction(BaseSyntheticBanditDataset):
A machine learning classifier such as logistic regression is used to construct behavior and evaluation policies as follows.
1. Split the original data into training (:math:`\\mathcal{D}_{\\mathrm{tr}}`) and evaluation (:math:`\\mathcal{D}_{\\mathrm{ev}}`) sets.
2. Train classifiers on :math:`\\mathcal{D}_{\\mathrm{tr}}` and regard them as base deterministic policies :math:`\\pi_{\\mathrm{det},b}` and :math:`\\pi_{\\mathrm{det},e}`.
3. Construct behavior (:math:`\\pi_{b}`) and evaluation (:math:`\\pi_{e}`) policies based on :math:`\\pi_{\\mathrm{det}}` as
2. Train classifiers on :math:`\\mathcal{D}_{\\mathrm{tr}}` and obtain base deterministic policies :math:`\\pi_{\\mathrm{det},b}` and :math:`\\pi_{\\mathrm{det},e}`.
3. Construct behavior (:math:`\\pi_{b}`) and evaluation (:math:`\\pi_{e}`) policies based on :math:`\\pi_{\\mathrm{det},b}` and :math:`\\pi_{\\mathrm{det},e}` as
.. math::
\\pi_b (a | x) := \\alpha_b \\pi_{\\mathrm{det},b} (a|x) + (1.0 - \\alpha_b) \\pi_{u} (a|x)
\\pi_b (a | x) := \\alpha_b \\cdot \\pi_{\\mathrm{det},b} (a|x) + (1.0 - \\alpha_b) \\cdot \\pi_{u} (a|x)
.. math::
\\pi_e (a | x) := \\alpha_e \\pi_{\\mathrm{det},e} (a|x) + (1.0 - \\alpha_e) \\pi_{u} (a|x)
\\pi_e (a | x) := \\alpha_e \\cdot \\pi_{\\mathrm{det},e} (a|x) + (1.0 - \\alpha_e) \\cdot \\pi_{u} (a|x)
where :math:`\\pi_{u}` is a uniform random policy and :math:`\\alpha_b` and :math:`\\alpha_e` are set by the user.
Expand All @@ -60,11 +60,11 @@ class MultiClassToBanditReduction(BaseSyntheticBanditDataset):
base_classifier_b: ClassifierMixin
Machine learning classifier used to construct a behavior policy.
alpha_b: float, default: 0.9
alpha_b: float, default=0.9
Ratio of the deterministic base policy (the remaining 1 - alpha_b goes to a uniform random policy) when constructing a **behavior** policy.
Must be in the [0, 1) interval to make the behavior policy a stochastic one.
dataset_name: str, default: None
dataset_name: str, default=None
Name of the dataset.
Examples
Expand Down Expand Up @@ -187,7 +187,7 @@ def split_train_eval(
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the evaluation split.
If int, represents the absolute number of test samples.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in train-evaluation split.
"""
Expand All @@ -213,12 +213,12 @@ def obtain_batch_bandit_feedback(
Please call `self.split_train_eval()` before calling this method.
Parameters
----------
-----------
eval_size: float or int, default=0.25
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
If int, represents the absolute number of test samples.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling actions.
Returns
Expand Down Expand Up @@ -261,10 +261,12 @@ def obtain_action_dist_by_eval_policy(
) -> np.ndarray:
"""Obtain action choice probabilities by an evaluation policy.
base_classifier_e: ClassifierMixin, default: None
Parameters
-----------
base_classifier_e: ClassifierMixin, default=None
Machine learning classifier used to construct an evaluation policy.
alpha_e: float, default: 1.0
alpha_e: float, default=1.0
Ratio of the deterministic base policy (the remaining 1 - alpha_e goes to a uniform random policy) when constructing an **evaluation** policy.
Must be in the [0, 1] interval (evaluation policy can be deterministic).
Expand Down
14 changes: 7 additions & 7 deletions obp/dataset/real.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ class OpenBanditDataset(BaseRealBanditDataset):
campaign: str
One of the three possible campaigns considered in ZOZOTOWN, "all", "men", and "women".
data_path: Path, default: Path('./obd')
data_path: Path, default=Path('./obd')
Path that stores Open Bandit Dataset.
dataset_name: str, default: 'obd'
dataset_name: str, default='obd'
Name of the dataset.
References
Expand Down Expand Up @@ -109,13 +109,13 @@ def calc_on_policy_policy_value_estimate(
campaign: str
One of the three possible campaigns considered in ZOZOTOWN (i.e., "all", "men", and "women").
data_path: Path, default: Path('./obd')
data_path: Path, default=Path('./obd')
Path that stores Open Bandit Dataset.
test_size: float, default=0.3
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
is_timeseries_split: bool, default: False
is_timeseries_split: bool, default=False
If true, split the original logged bandit feedback data by time series.
Returns
Expand Down Expand Up @@ -178,7 +178,7 @@ def obtain_batch_bandit_feedback(
test_size: float, default=0.3
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
is_timeseries_split: bool, default: False
is_timeseries_split: bool, default=False
If true, split the original logged bandit feedback data by time series.
Returns
Expand Down Expand Up @@ -233,10 +233,10 @@ def sample_bootstrap_bandit_feedback(
test_size: float, default=0.3
If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
is_timeseries_split: bool, default: False
is_timeseries_split: bool, default=False
If true, split the original logged bandit feedback data by time series.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling logged bandit dataset.
Returns
Expand Down
18 changes: 9 additions & 9 deletions obp/dataset/synthetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,29 +31,29 @@ class SyntheticBanditDataset(BaseSyntheticBanditDataset):
n_actions: int
Number of actions.
dim_context: int, default: 1
dim_context: int, default=1
Number of dimensions of context vectors.
reward_type: str, default: 'binary'
reward_type: str, default='binary'
Type of reward variable, must be either 'binary' or 'continuous'.
When 'binary' is given, rewards are sampled from the Bernoulli distribution.
When 'continuous' is given, rewards are sampled from the truncated Normal distribution with `scale=1`.
reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray]], default: None
reward_function: Callable[[np.ndarray, np.ndarray], np.ndarray]], default=None
Function generating expected reward with context and action context vectors,
i.e., :math:`\\mu: \\mathcal{X} \\times \\mathcal{A} \\rightarrow \\mathbb{R}`.
If None is set, context **independent** expected reward for each action will be
sampled from the uniform distribution automatically.
behavior_policy_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default: None
behavior_policy_function: Callable[[np.ndarray, np.ndarray], np.ndarray], default=None
Function generating probability distribution over action space,
i.e., :math:`\\pi: \\mathcal{X} \\rightarrow \\Delta(\\mathcal{A})`.
If None is set, context **independent** uniform distribution will be used (uniform random behavior policy).
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling synthetic bandit dataset.
dataset_name: str, default: 'synthetic_bandit_dataset'
dataset_name: str, default='synthetic_bandit_dataset'
Name of the dataset.
Examples
Expand Down Expand Up @@ -252,7 +252,7 @@ def logistic_reward_function(
action_context: array-like, shape (n_actions, dim_action_context)
Vector representation for each action.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling dataset.
Returns
Expand Down Expand Up @@ -292,7 +292,7 @@ def linear_reward_function(
action_context: array-like, shape (n_actions, dim_action_context)
Vector representation for each action.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling dataset.
Returns
Expand Down Expand Up @@ -332,7 +332,7 @@ def linear_behavior_policy(
action_context: array-like, shape (n_actions, dim_action_context)
Vector representation for each action.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in sampling dataset.
Returns
Expand Down
46 changes: 23 additions & 23 deletions obp/ope/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class ReplayMethod(BaseOffPolicyEstimator):
Parameters
----------
estimator_name: str, default: 'rm'.
estimator_name: str, default='rm'.
Name of off-policy estimator.
References
Expand Down Expand Up @@ -151,13 +151,13 @@ def estimate_interval(
position: array-like, shape (n_rounds,)
Positions of each round in the given logged bandit feedback.
alpha: float, default: 0.05
alpha: float, default=0.05
Significance level.
n_bootstrap_samples: int, default: 10000
n_bootstrap_samples: int, default=10000
Number of resampling performed in the bootstrap procedure.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in bootstrap sampling.
Returns
Expand Down Expand Up @@ -197,7 +197,7 @@ class InverseProbabilityWeighting(BaseOffPolicyEstimator):
Parameters
------------
estimator_name: str, default: 'ipw'.
estimator_name: str, default='ipw'.
Name of off-policy estimator.
References
Expand Down Expand Up @@ -320,13 +320,13 @@ def estimate_interval(
Distribution over actions or the action choice probabilities
by the evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a|x)`.
alpha: float, default: 0.05
alpha: float, default=0.05
P-value.
n_bootstrap_samples: int, default: 10000
n_bootstrap_samples: int, default=10000
Number of resampling performed in the bootstrap procedure.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in bootstrap sampling.
Returns
Expand Down Expand Up @@ -372,7 +372,7 @@ class SelfNormalizedInverseProbabilityWeighting(InverseProbabilityWeighting):
Parameters
----------
estimator_name: str, default: 'snipw'.
estimator_name: str, default='snipw'.
Name of off-policy estimator.
References
Expand Down Expand Up @@ -451,7 +451,7 @@ class DirectMethod(BaseOffPolicyEstimator):
Parameters
----------
estimator_name: str, default: 'dm'.
estimator_name: str, default='dm'.
Name of off-policy estimator.
References
Expand Down Expand Up @@ -554,13 +554,13 @@ def estimate_interval(
estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
Estimated rewards for each round, action, and position by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
alpha: float, default: 0.05
alpha: float, default=0.05
P-value.
n_bootstrap_samples: int, default: 10000
n_bootstrap_samples: int, default=10000
Number of resampling performed in the bootstrap procedure.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in bootstrap sampling.
Returns
Expand Down Expand Up @@ -611,7 +611,7 @@ class DoublyRobust(InverseProbabilityWeighting):
Parameters
----------
estimator_name: str, default: 'dr'.
estimator_name: str, default='dr'.
Name of off-policy estimator.
References
Expand Down Expand Up @@ -760,13 +760,13 @@ def estimate_interval(
estimated_rewards_by_reg_model: array-like, shape (n_rounds, n_actions, len_list)
Estimated rewards for each round, action, and position by regression model, i.e., :math:`\\hat{q}(x_t,a_t)`.
alpha: float, default: 0.05
alpha: float, default=0.05
P-value.
n_bootstrap_samples: int, default: 10000
n_bootstrap_samples: int, default=10000
Number of resampling performed in the bootstrap procedure.
random_state: int, default: None
random_state: int, default=None
Controls the random seed in bootstrap sampling.
Returns
Expand Down Expand Up @@ -815,7 +815,7 @@ class SelfNormalizedDoublyRobust(DoublyRobust):
Parameters
----------
estimator_name: str, default: 'sndr'.
estimator_name: str, default='sndr'.
Name of off-policy estimator.
References
Expand Down Expand Up @@ -906,11 +906,11 @@ class SwitchInverseProbabilityWeighting(DoublyRobust):
Parameters
----------
tau: float, default: 1
tau: float, default=1
Switching hyperparameter. When importance weight is larger than this parameter, the DM estimator is applied, otherwise the IPW estimator is applied.
This hyperparameter should be larger than 1., otherwise it is meaningless.
estimator_name: str, default: 'switch-ipw'.
estimator_name: str, default='switch-ipw'.
Name of off-policy estimator.
References
Expand Down Expand Up @@ -1007,11 +1007,11 @@ class SwitchDoublyRobust(DoublyRobust):
Parameters
----------
tau: float, default: 1
tau: float, default=1
Switching hyperparameter. When importance weight is larger than this parameter, the DM estimator is applied, otherwise the DR estimator is applied.
This hyperparameter should be larger than 0., otherwise it is meaningless.
estimator_name: str, default: 'switch-dr'.
estimator_name: str, default='switch-dr'.
Name of off-policy estimator.
References
Expand Down Expand Up @@ -1127,7 +1127,7 @@ class DoublyRobustWithShrinkage(DoublyRobust):
lambda_: float
Shrinkage hyperparameter. This hyperparameter should be larger than 0., otherwise it is meaningless.
estimator_name: str, default: 'dr-os'.
estimator_name: str, default='dr-os'.
Name of off-policy estimator.
References
Expand Down
Loading

0 comments on commit 0667738

Please sign in to comment.