From 6449a43261eeea82185e07dd659a79afff9556ab Mon Sep 17 00:00:00 2001
From: Jiayi Weng
Date: Tue, 26 Sep 2023 08:24:08 -0700
Subject: [PATCH] Fix documentation build (#951)

Close #941

rtfd build link: https://readthedocs.org/projects/tianshou/builds/22019877/

Also -- fix two small issues reported by users, see #928 and #930

Note: I created the branch in thu-ml:tianshou instead of Trinkle23897:tianshou to quickly check the rtfd build. It's not a good process since every commit would trigger the CI pipelines twice :(
---
 .readthedocs.yaml            | 24 ++++++++++++++++++++++++
 README.md                    | 22 ++++++++++++++--------
 docs/index.rst               |  2 +-
 docs/requirements.txt        |  3 +--
 docs/tutorials/dqn.rst       | 16 +++++++++++-----
 examples/inverse/irl_gail.py | 13 +++++--------
 6 files changed, 56 insertions(+), 24 deletions(-)
 create mode 100644 .readthedocs.yaml

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 000000000..6d1ba8be9
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,24 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+  jobs:
+    pre_build:
+      - pip install .
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: docs/conf.py
+# We recommend specifying your dependencies to enable reproducible builds:
+# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+  install:
+    - requirements: docs/requirements.txt
diff --git a/README.md b/README.md
index 536ddbdda..006951288 100644
--- a/README.md
+++ b/README.md
@@ -8,9 +8,7 @@
 
 > ⚠️️ **Transition to Gymnasium**: The maintainers of OpenAI Gym have recently released [Gymnasium](http://github.com/Farama-Foundation/Gymnasium),
 > which is where future maintenance of OpenAI Gym will be taking place.
-> Tianshou has transitioned to internally using Gymnasium environments. You can still use OpenAI Gym environments with
-> Tianshou vector environments, but they will be wrapped in a compatibility layer, which could be a source of issues.
-> We recommend that you update your environment code to Gymnasium. If you want to continue using OpenAI Gym with
+> Tianshou has transitioned to internally using Gymnasium environments. If you want to continue using OpenAI Gym with
 > Tianshou, you need to manually install Gym and [Shimmy](https://github.com/Farama-Foundation/Shimmy) (the compatibility layer).
 
 **Tianshou** ([天授](https://baike.baidu.com/item/%E5%A4%A9%E6%8E%88)) is a reinforcement learning platform based on pure PyTorch. Unlike existing reinforcement learning libraries, which are mainly based on TensorFlow, have many nested classes, unfriendly API, or slow-speed, Tianshou provides a fast-speed modularized framework and pythonic API for building the deep reinforcement learning agent with the least number of lines of code. The supported interface algorithms currently include:
@@ -69,7 +67,7 @@ In Chinese, Tianshou means divinely ordained and is derived to the gift of being
 
 ## Installation
 
-Tianshou is currently hosted on [PyPI](https://pypi.org/project/tianshou/) and [conda-forge](https://github.com/conda-forge/tianshou-feedstock). It requires Python >= 3.8.
+Tianshou is currently hosted on [PyPI](https://pypi.org/project/tianshou/) and [conda-forge](https://github.com/conda-forge/tianshou-feedstock). It requires Python >= 3.11.
 
 You can simply install Tianshou from PyPI with the following command:
 
@@ -234,13 +232,21 @@ test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True) #
 
 Let's train it:
 
 ```python
-result = ts.trainer.offpolicy_trainer(
-    policy, train_collector, test_collector, epoch, step_per_epoch, step_per_collect,
-    test_num, batch_size, update_per_step=1 / step_per_collect,
+result = ts.trainer.OffpolicyTrainer(
+    policy=policy,
+    train_collector=train_collector,
+    test_collector=test_collector,
+    max_epoch=epoch,
+    step_per_epoch=step_per_epoch,
+    step_per_collect=step_per_collect,
+    episode_per_test=test_num,
+    batch_size=batch_size,
+    update_per_step=1 / step_per_collect,
     train_fn=lambda epoch, env_step: policy.set_eps(eps_train),
     test_fn=lambda epoch, env_step: policy.set_eps(eps_test),
     stop_fn=lambda mean_rewards: mean_rewards >= env.spec.reward_threshold,
-    logger=logger)
+    logger=logger,
+).run()
 print(f'Finished training! Use {result["duration"]}')
 ```
diff --git a/docs/index.rst b/docs/index.rst
index 7f557fa1e..758b4d40f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -63,7 +63,7 @@ Here is Tianshou's other features:
 
 Installation
 ------------
 
-Tianshou is currently hosted on `PyPI `_ and `conda-forge `_. It requires Python >= 3.8.
+Tianshou is currently hosted on `PyPI `_ and `conda-forge `_. It requires Python >= 3.11.
 
 You can simply install Tianshou from PyPI with the following command:
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 56a21fd84..c4cb05627 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,7 +1,6 @@
-gym
 numba
 numpy>=1.20
-sphinx
+sphinx<7
 sphinxcontrib-bibtex
 sphinx_rtd_theme>=0.5.1
 tensorboard
diff --git a/docs/tutorials/dqn.rst b/docs/tutorials/dqn.rst
index b2c5844e2..87c84d647 100644
--- a/docs/tutorials/dqn.rst
+++ b/docs/tutorials/dqn.rst
@@ -181,19 +181,25 @@ The main function of collector is the collect function, which can be summarized
 Train Policy with a Trainer
 ---------------------------
 
-Tianshou provides :func:`~tianshou.trainer.onpolicy_trainer`, :func:`~tianshou.trainer.offpolicy_trainer`, and :func:`~tianshou.trainer.offline_trainer`. The trainer will automatically stop training when the policy reach the stop condition ``stop_fn`` on test collector. Since DQN is an off-policy algorithm, we use the :func:`~tianshou.trainer.offpolicy_trainer` as follows:
+Tianshou provides :class:`~tianshou.trainer.OnpolicyTrainer`, :class:`~tianshou.trainer.OffpolicyTrainer`,
+and :class:`~tianshou.trainer.OfflineTrainer`. The trainer will automatically stop training when the policy
+reaches the stop condition ``stop_fn`` on the test collector. Since DQN is an off-policy algorithm, we use the
+:class:`~tianshou.trainer.OffpolicyTrainer` as follows:
 
 ::
 
-    result = ts.trainer.offpolicy_trainer(
-        policy, train_collector, test_collector,
+    result = ts.trainer.OffpolicyTrainer(
+        policy=policy,
+        train_collector=train_collector,
+        test_collector=test_collector,
         max_epoch=10, step_per_epoch=10000, step_per_collect=10,
         update_per_step=0.1, episode_per_test=100, batch_size=64,
         train_fn=lambda epoch, env_step: policy.set_eps(0.1),
         test_fn=lambda epoch, env_step: policy.set_eps(0.05),
-        stop_fn=lambda mean_rewards: mean_rewards >= env.spec.reward_threshold)
+        stop_fn=lambda mean_rewards: mean_rewards >= env.spec.reward_threshold
+    ).run()
     print(f'Finished training! Use {result["duration"]}')
 
-The meaning of each parameter is as follows (full description can be found at :func:`~tianshou.trainer.offpolicy_trainer`):
+The meaning of each parameter is as follows (full description can be found at :class:`~tianshou.trainer.OffpolicyTrainer`):
 
 * ``max_epoch``: The maximum of epochs for training. The training process might be finished before reaching the ``max_epoch``;
 * ``step_per_epoch``: The number of environment step (a.k.a. transition) collected per epoch;
diff --git a/examples/inverse/irl_gail.py b/examples/inverse/irl_gail.py
index 606c228ca..afa4f776a 100644
--- a/examples/inverse/irl_gail.py
+++ b/examples/inverse/irl_gail.py
@@ -15,7 +15,7 @@
 from torch.utils.tensorboard import SummaryWriter
 
 from tianshou.data import Batch, Collector, ReplayBuffer, VectorReplayBuffer
-from tianshou.env import SubprocVectorEnv
+from tianshou.env import SubprocVectorEnv, VectorEnvNormObs
 from tianshou.policy import GAILPolicy
 from tianshou.trainer import OnpolicyTrainer
 from tianshou.utils import TensorboardLogger
@@ -97,15 +97,12 @@ def test_gail(args=get_args()):
     # train_envs = gym.make(args.task)
     train_envs = SubprocVectorEnv(
         [lambda: NoRewardEnv(gym.make(args.task)) for _ in range(args.training_num)],
-        norm_obs=True,
     )
+    train_envs = VectorEnvNormObs(train_envs)
     # test_envs = gym.make(args.task)
-    test_envs = SubprocVectorEnv(
-        [lambda: gym.make(args.task) for _ in range(args.test_num)],
-        norm_obs=True,
-        obs_rms=train_envs.obs_rms,
-        update_obs_rms=False,
-    )
+    test_envs = SubprocVectorEnv([lambda: gym.make(args.task) for _ in range(args.test_num)])
+    test_envs = VectorEnvNormObs(test_envs, update_obs_rms=False)
+    test_envs.set_obs_rms(train_envs.get_obs_rms())
 
     # seed
     np.random.seed(args.seed)
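
For reference, a minimal sketch of the observation-normalization pattern that the `irl_gail.py` hunk above migrates to. The wrapper and its API (`VectorEnvNormObs`, `update_obs_rms`, `get_obs_rms`/`set_obs_rms`) and `SubprocVectorEnv` come from the diff itself; the Gymnasium backend, task name, and environment counts are placeholder assumptions, not part of the patch.

```python
# Sketch (not part of the patch): wiring VectorEnvNormObs in place of the
# removed norm_obs=True / obs_rms=... arguments of SubprocVectorEnv.
# Assumptions: Gymnasium as the env backend, a placeholder task name,
# and arbitrary env counts.
import gymnasium as gym

from tianshou.env import SubprocVectorEnv, VectorEnvNormObs

task = "CartPole-v1"  # placeholder task name

# Training envs keep updating the running mean/std of observations.
train_envs = VectorEnvNormObs(
    SubprocVectorEnv([lambda: gym.make(task) for _ in range(4)])
)

# Test envs reuse the training statistics but never update them,
# so evaluation sees the same normalization as training.
test_envs = VectorEnvNormObs(
    SubprocVectorEnv([lambda: gym.make(task) for _ in range(2)]),
    update_obs_rms=False,
)
test_envs.set_obs_rms(train_envs.get_obs_rms())
```

This mirrors how the updated example shares the running observation statistics between the train and test vector environments instead of passing `obs_rms` through the `SubprocVectorEnv` constructor.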