diff --git a/.circleci/config.yml b/.circleci/config.yml index 47f7ad9b18..785b383e10 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2.1 jobs: test-local-gcc: machine: - image: ubuntu-2004:202010-01 + image: default working_directory: ~/criu steps: - checkout @@ -11,7 +11,7 @@ jobs: command: sudo -E make -C scripts/ci local test-local-clang: machine: - image: ubuntu-2004:202010-01 + image: default working_directory: ~/criu steps: - checkout diff --git a/.cirrus.yml b/.cirrus.yml index bd4799fd0b..5e30ca2c2b 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,10 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-future python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel + # The image has a too old version of nettle which does not work with gnutls. + # Just upgrade to the latest to make the error go away. + dnf -y upgrade nettle nettle-devel systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. 
# The Cirrus CI user runs as a service from selinux point of view and is @@ -89,69 +92,6 @@ task: build_script: | make -C scripts/ci vagrant-fedora-non-root -task: - name: CentOS Stream 8 based test - environment: - HOME: "/root" - CIRRUS_WORKING_DIR: "/tmp/criu" - - compute_engine_instance: - image_project: centos-cloud - image: family/centos-stream-8 - platform: linux - cpu: 4 - memory: 8G - - setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - # Do not fail if latest epel repository definition is already installed - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : - yum install -y dnf-plugins-core - yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-importlib-metadata python3-junit_xml xmlto - alternatives --set python /usr/bin/python3 - systemctl stop sssd - # Even with selinux in permissive mode the selinux tests will be executed - # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode - setenforce 0 - - build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" - -task: - name: CentOS 7 based test - environment: - HOME: "/root" - CIRRUS_WORKING_DIR: "/tmp/criu" - - compute_engine_instance: - image_project: centos-cloud - image: family/centos-7 - platform: linux - cpu: 4 - memory: 8G - - setup_script: | - # EPEL is needed for python2-future, python2-junit_xml, python-flake8 and libbsd-devel. 
- # Do not fail if latest epel repository definition is already installed - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm || : - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel - # Even with selinux in permissive mode the selinux tests will be executed - # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode - setenforce 0 - # Enable user namespaces on CentOS 7 - echo 10000 > /proc/sys/user/max_user_namespaces - # Adapt sudoers to our needs - echo 'root ALL=(ALL:ALL) ALL' | EDITOR='tee -a' visudo - - build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_IGNORE_TAINT=1 ZDTM_OPTS="-x zdtm/static/socket-raw -x zdtm/static/child_subreaper_existing_child -x zdtm/static/fifo_upon_unix_socket01 -x zdtm/static/overmount_sock -x zdtm/static/tempfs_overmounted" - task: name: aarch64 build GCC (native) arm_container: diff --git a/.clang-format b/.clang-format index 4756380158..fb40bc613b 100644 --- a/.clang-format +++ b/.clang-format @@ -53,7 +53,7 @@ BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false -ColumnLimit: 120 +ColumnLimit: 0 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false # Unknown to clang-format-4.0 ConstructorInitializerAllOnOneLineOrOnePerLine: false 
diff --git a/.codespellrc b/.codespellrc index 765dacfabb..dd31dd851c 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] skip = ./.git,./test/pki -ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng +ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml new file mode 100644 index 0000000000..8b0a63fc7b --- /dev/null +++ b/.github/workflows/actuated-aarch64-test.yaml @@ -0,0 +1,52 @@ +name: Actuated aarch64 test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: actuated-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + # Actuated runners are not available in all repositories. + if: ${{ github.repository == 'checkpoint-restore/criu' }} + # The memory size and the number of CPUs can be freely selected. + # 3GB and 4 CPUs seems to be enough according to the result from 'vmmeter'. 
+ runs-on: actuated-arm64-4cpu-3gb + strategy: + matrix: + target: [GCC=1, CLANG=1] + + steps: + # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md + # vmmeter start + - name: Prepare arkade + uses: alexellis/arkade-get@master + with: + crane: latest + print-summary: false + + - name: Install vmmeter + run: | + crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin + + - name: Run vmmeter + uses: self-actuated/vmmeter-action@master + # vmmeter end + + - uses: actions/checkout@v4 + - name: Run Tests ${{ matrix.target }} + # Following tests are failing on the actuated VMs: + # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out + # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) + # + # In combination with '--remote-lazy-pages' following error occurs: + # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) + run: | + # The 'sched_policy00' needs the following: + sudo sysctl -w kernel.sched_rt_runtime_us=-1 + # etc/hosts entry is needed for netns_lock_iptables + echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts + sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ + ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 06f466c519..73530d79ae 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -9,12 +9,12 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: target: [GCC=1, CLANG=1] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Alpine ${{ matrix.target }} Test run: sudo -E make -C scripts/ci alpine ${{ matrix.target }} diff --git a/.github/workflows/archlinux-test.yml b/.github/workflows/archlinux-test.yml index 328cc9d0f7..425f0662be 100644 --- 
a/.github/workflows/archlinux-test.yml +++ b/.github/workflows/archlinux-test.yml @@ -9,8 +9,8 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Arch Linux Test run: sudo -E make -C scripts/ci archlinux diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml new file mode 100644 index 0000000000..be2fbd2856 --- /dev/null +++ b/.github/workflows/check-commits.yml @@ -0,0 +1,30 @@ +name: Verify self-contained commits + +on: pull_request + +# Cancel any preceding run on the pull request +concurrency: + group: commit-test-${{ github.event.pull_request.number }} + +jobs: + build: + runs-on: ubuntu-latest + # Check if pull request does not have label "not-selfcontained-ok" + if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" + steps: + - uses: actions/checkout@v3 + with: + # Needed to rebase against the base branch + fetch-depth: 0 + # Checkout pull request HEAD commit instead of merge commit + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: sudo apt-get install -y libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + - name: Configure git user details + run: | + git config --global user.email "checkpoint-restore@users.noreply.github.com" + git config --global user.name "checkpoint-restore" + - name: Configure base branch without switching current branch + run: git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + - name: Build each commit + run: git rebase ${{ github.base_ref }} -x "make -C scripts/ci check-commit" diff --git a/.github/workflows/compat-test.yml b/.github/workflows/compat-test.yml index 79f8f00105..8a64ce1857 100644 --- a/.github/workflows/compat-test.yml +++ b/.github/workflows/compat-test.yml @@ -9,13 +9,13 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: 
ubuntu-22.04 strategy: matrix: target: [GCC, CLANG] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Compat Tests (${{ matrix.target }}) run: sudo -E make -C scripts/ci local COMPAT_TEST=y ${{ matrix.target }}=1 diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml index 927ddced26..b8c8c86d48 100644 --- a/.github/workflows/cross-compile-daily.yml +++ b/.github/workflows/cross-compile-daily.yml @@ -14,7 +14,7 @@ jobs: branches: [criu-dev, master] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: ref: ${{ matrix.branches }} - name: Run Cross Compilation Targets diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index 4da5d397c6..06b8128231 100644 --- a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -33,7 +33,7 @@ jobs: target: mips64el-unstable-cross steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Cross Compilation Targets run: > sudo make -C scripts/ci ${{ matrix.target }} diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml index fabf399fd3..23696905a3 100644 --- a/.github/workflows/docker-test.yml +++ b/.github/workflows/docker-test.yml @@ -12,8 +12,8 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04] + os: [ubuntu-22.04] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Docker Test (${{ matrix.os }}) run: sudo make -C scripts/ci docker-test diff --git a/.github/workflows/fedora-asan-test.yml b/.github/workflows/fedora-asan-test.yml index 8b1bfcf323..02dc9a1b3f 100644 --- a/.github/workflows/fedora-asan-test.yml +++ b/.github/workflows/fedora-asan-test.yml @@ -9,9 +9,9 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Fedora ASAN Test run: sudo -E make -C scripts/ci fedora-asan diff 
--git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index 5355aa1926..83e2ead825 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -9,10 +9,10 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Fedora Rawhide Test # We need to pass environment variables from the CI environment to # distinguish between CI environments. However, we need to make sure that diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml index fcab478371..cc4e1d44ac 100644 --- a/.github/workflows/gcov-test.yml +++ b/.github/workflows/gcov-test.yml @@ -9,10 +9,10 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Coverage Tests run: sudo -E make -C scripts/ci local GCOV=1 - name: Run gcov diff --git a/.github/workflows/java-test.yml b/.github/workflows/java-test.yml index abed793bf3..cbd3c1f23f 100644 --- a/.github/workflows/java-test.yml +++ b/.github/workflows/java-test.yml @@ -9,8 +9,8 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Java Test run: sudo make -C scripts/ci java-test diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index e18f921f3e..862d682458 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,9 +14,9 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format ShellCheck + run: sudo dnf -y install git make ruff xz clang-tools-extra which codespell git-clang-format ShellCheck - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set git safe directory # 
https://github.com/actions/checkout/issues/760 @@ -26,15 +26,15 @@ jobs: run: make lint - name: Run make indent - run: > + continue-on-error: true + run: | if [ -z "${{github.base_ref}}" ]; then - git fetch --deepen=1 && - if ! make indent OPTS=--diff; then - exit 1 - fi + git fetch --deepen=1 + make indent else - git fetch origin ${{github.base_ref}} && - if ! make indent OPTS=--diff BASE=origin/${{github.base_ref}}; then - exit 1 - fi + git fetch origin ${{github.base_ref}} + make indent BASE=origin/${{github.base_ref}} fi + - name: Raise in-line make indent warnings + run: | + git diff | ./scripts/github-indent-warnings.py diff --git a/.github/workflows/loongarch64-qemu-test.yml b/.github/workflows/loongarch64-qemu-test.yml new file mode 100644 index 0000000000..d7c554c872 --- /dev/null +++ b/.github/workflows/loongarch64-qemu-test.yml @@ -0,0 +1,15 @@ +name: LoongArch64 Qemu Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: loongarch64-qemu-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - run: sudo make -C scripts/ci loongarch64-qemu-test diff --git a/.github/workflows/manage-labels.yml b/.github/workflows/manage-labels.yml new file mode 100644 index 0000000000..a2bcd88604 --- /dev/null +++ b/.github/workflows/manage-labels.yml @@ -0,0 +1,14 @@ +name: Remove labels +on: [issue_comment, pull_request_review_comment] +jobs: + remove-labels-on-comments: + name: Remove labels on comments + if: github.event_name == 'issue_comment' + runs-on: ubuntu-latest + steps: + - uses: mondeja/remove-labels-gh-action@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + labels: | + changes requested + awaiting reply diff --git a/.github/workflows/podman-test.yml b/.github/workflows/podman-test.yml index a7013a216f..a07edbe5b2 100644 --- 
a/.github/workflows/podman-test.yml +++ b/.github/workflows/podman-test.yml @@ -9,8 +9,8 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Podman Test run: sudo make -C scripts/ci podman-test diff --git a/.github/workflows/stream-test.yml b/.github/workflows/stream-test.yml index 0f5b307db9..76bd96edf7 100644 --- a/.github/workflows/stream-test.yml +++ b/.github/workflows/stream-test.yml @@ -9,9 +9,9 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run CRIU Image Streamer Test run: sudo -E make -C scripts/ci local STREAM_TEST=1 diff --git a/.github/workflows/x86-64-clang-test.yml b/.github/workflows/x86-64-clang-test.yml index b3b50829a4..1f0a469bd5 100644 --- a/.github/workflows/x86-64-clang-test.yml +++ b/.github/workflows/x86-64-clang-test.yml @@ -9,8 +9,8 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run X86_64 CLANG Test run: sudo make -C scripts/ci x86_64 CLANG=1 diff --git a/.github/workflows/x86-64-gcc-test.yml b/.github/workflows/x86-64-gcc-test.yml index ec70b61fb1..15e84a0dfc 100644 --- a/.github/workflows/x86-64-gcc-test.yml +++ b/.github/workflows/x86-64-gcc-test.yml @@ -9,8 +9,8 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run X86_64 GCC Test run: sudo make -C scripts/ci x86_64 diff --git a/.gitignore b/.gitignore index 1ea828bbcd..854657d1c1 100644 --- a/.gitignore +++ b/.gitignore @@ -25,13 +25,6 @@ images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest -crit/crit -criu/arch/*/sys-exec-tbl*.c -# x86 syscalls-table is not generated -!criu/arch/x86/sys-exec-tbl.c -criu/arch/*/syscalls*.S -criu/include/syscall-codes*.h 
-criu/include/syscall*.h criu/include/version.h criu/pie/restorer-blob.h criu/pie/parasite-blob.h diff --git a/.lgtm.yml b/.lgtm.yml index a884a53ef1..0dd49cda41 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -22,7 +22,6 @@ extraction: - "libbsd-dev" - "python3-yaml" - "libnl-route-3-dev" - - "python-future" - "gnutls-dev" configure: command: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 87da08b343..37965e5fba 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,21 +46,20 @@ This should create the `./criu/criu` executable. ## Edit the source code -If you use ctags, you can generate the ctags file by running - -``` - make tags -``` - When you change the source code, please keep in mind the following code conventions: +* code is written to be read, so the code readability is the most important thing you need to have in mind when preparing patches * we prefer tabs and indentations to be 8 characters width -* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community. +* we prefer line length of 80 characters or less, more is allowed if it helps with code readability +* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community -Other conventions can be learned from the source code itself. In short, make sure your new code -looks similar to what is already there. +Other conventions can be learned from the source code itself. In short, make sure your new code looks similar to what is already there. 
-The following command can be used to automatically run a code linter for Python files (flake8), Shell scripts (shellcheck), +## Automatic tools to fix coding-style + +Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. + +The following command can be used to automatically run a code linter for Python files (ruff), Shell scripts (shellcheck), text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). ``` @@ -90,6 +89,41 @@ to check the last *N* commits for formatting errors, without applying the change Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected we need to review the suggested changes and decide if they should be fixed before merging. +Here are some bad examples of clang-format-ing: + +* if clang-format tries to force 120 characters and breaks readability - it is wrong: + +``` +@@ -58,8 +59,7 @@ static int register_membarriers(void) + } + + if (!all_ok) { +- fail("can't register membarrier()s - tried %#x, kernel %#x", +- barriers_registered, barriers_supported); ++ fail("can't register membarrier()s - tried %#x, kernel %#x", barriers_registered, barriers_supported); + return -1; + } +``` + +* if clang-format breaks your beautiful readability friendly alignment in structures, comments or defines - it is wrong: + +``` +--- a/test/zdtm/static/membarrier.c ++++ b/test/zdtm/static/membarrier.c +@@ -27,9 +27,10 @@ static const struct { + int register_cmd; + int execute_cmd; + } membarrier_cmds[] = { +- { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, +- { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, +- { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 
MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, ++ { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, ++ { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, ++ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, ++ { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, + }; +``` + ## Test your changes CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run diff --git a/Documentation/Makefile b/Documentation/Makefile index 5085514501..de0cc448dc 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -12,11 +12,9 @@ endif FOOTER := footer.txt SRC1 += crit.txt -ifeq ($(PYTHON),python3) SRC1 += criu-ns.txt -endif SRC1 += compel.txt -SRC1 += amdgpu_plugin.txt +SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) diff --git a/Documentation/amdgpu_plugin.txt b/Documentation/criu-amdgpu-plugin.txt similarity index 83% rename from Documentation/amdgpu_plugin.txt rename to Documentation/criu-amdgpu-plugin.txt index 0d490b4292..68803f3dbc 100644 --- a/Documentation/amdgpu_plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -3,7 +3,7 @@ ROCM Support(1) NAME ---- -amdgpu_plugin - A plugin extension to CRIU to support checkpoint/restore in +criu-amdgpu-plugin - A plugin extension to CRIU to support checkpoint/restore in userspace for AMD GPUs. @@ -22,19 +22,15 @@ Though *criu* is a great tool for checkpointing and restoring running applications, it has certain limitations such as it cannot handle applications that have device files open. In order to support *ROCm* based workloads with *criu* we need to augment criu's core functionality with a -plugin based extension mechanism. *amdgpu_plugin* provides the necessary support +plugin based extension mechanism. *criu-amdgpu-plugin* provides the necessary support to criu to allow Checkpoint / Restore with ROCm. 
Dependencies -~~~~~~~~~~~~~~ +------------ *amdkfd support*:: In order to snapshot the *VRAM* and other *GPU* device states, we require - an updated version of amdkfd(amdgpu) driver. The kernel patches are under - review currently. - -*criu 3.16*:: - This work is rebased on latest criu release available at this time. + an updated version of amdkfd(amdgpu) driver. OPTIONS ------- @@ -97,6 +93,15 @@ executing criu command. E.g: KFD_CAPABILITY_CHECK=1 +*KFD_MAX_BUFFER_SIZE*:: + On some systems, VRAM sizes may exceed RAM sizes, and so buffers for dumping + and restoring VRAM may be unable to fit. Set to a nonzero value (in bytes) + to set a limit on the plugin's memory usage. + Default:0 (Disabled) + + E.g: + KFD_MAX_BUFFER_SIZE="2G" + AUTHOR ------ diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 0e7d19c4cd..606935790b 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -155,6 +155,11 @@ not compatible with *--external* *dev*. notification message contains a file descriptor for the master pty + *query-ext-files*::: + called after the process tree is stopped and network is locked. + This hook is used only in the RPC mode. The notification reply + contains file ids to be added to external file list (may be empty). + *--unprivileged*:: This option tells *criu* to accept the limitations when running as non-root. Running as non-root requires *criu* at least to have @@ -457,6 +462,9 @@ The 'mode' may be one of the following: *nftables*::: Use nftables rules to drop the packets. + *skip*::: Don't lock the network. If *--tcp-close* is not used, the network + must be locked externally to allow CRIU to dump TCP connections. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. 
diff --git a/Makefile b/Makefile index 8061a42c45..e98eed0599 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endif # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif @@ -35,18 +35,18 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 endif ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp endif ifeq ($(ARMV),8) # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. - USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif @@ -80,6 +80,10 @@ ifeq ($(ARCH),mips) DEFINES := -DCONFIG_MIPS endif +ifeq ($(ARCH),loongarch64) + DEFINES := -DCONFIG_LOONGARCH64 +endif + # # CFLAGS_PIE: # @@ -102,6 +106,7 @@ export PROTOUFIX DEFINES # # Independent options for all tools. 
DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_LARGEFILE64_SOURCE DEFINES += -D_GNU_SOURCE WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes @@ -122,6 +127,10 @@ ifeq ($(ARCH),mips) WARNINGS := -rdynamic endif +ifeq ($(ARCH),loongarch64) +WARNINGS += -Wno-implicit-function-declaration +endif + ifneq ($(GCOV),) LDFLAGS += -lgcov CFLAGS += $(CFLAGS-GCOV) @@ -151,12 +160,12 @@ export GMON GMONLDOPT endif AFLAGS += -D__ASSEMBLY__ -CFLAGS += $(USERCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: flog criu lib crit cuda_plugin .PHONY: all # @@ -242,6 +251,15 @@ soccr/built-in.o: $(CONFIG_HEADER) .FORCE $(SOCCR_A): |soccr/built-in.o criu-deps += $(SOCCR_A) +#flog gets used by criu, build it earlier + +flogMakefile: ; +flog%: + $(Q) $(MAKE) $(build)=flog $@ +flog: + $(Q) $(MAKE) $(build)=flog all +.PHONY: flog + # # CRIU building done in own directory # with slightly different rules so we @@ -259,53 +277,50 @@ criu: $(criu-deps) $(Q) $(MAKE) $(build)=criu all .PHONY: criu -crit/Makefile: ; -crit/%: criu .FORCE - $(Q) $(MAKE) $(build)=crit $@ -crit: criu - $(Q) $(MAKE) $(build)=crit all -.PHONY: crit - unittest: $(criu-deps) $(Q) $(MAKE) $(build)=criu unittest .PHONY: unittest # -# Libraries next once crit it ready +# Libraries next once criu is ready # (we might generate headers and such # when building criu itself). 
lib/Makefile: ; -lib/%: crit .FORCE +lib/%: criu .FORCE $(Q) $(MAKE) $(build)=lib $@ -lib: crit +lib: criu $(Q) $(MAKE) $(build)=lib all .PHONY: lib clean mrproper: $(Q) $(MAKE) $(build)=images $@ + $(Q) $(MAKE) $(build)=flog $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ - $(Q) $(MAKE) $(build)=lib $@ - $(Q) $(MAKE) $(build)=crit $@ .PHONY: clean mrproper clean-amdgpu_plugin: $(Q) $(MAKE) -C plugins/amdgpu clean .PHONY: clean-amdgpu_plugin +clean-cuda_plugin: + $(Q) $(MAKE) -C plugins/cuda clean +.PHONY: clean-cuda_plugin + clean-top: $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) $(build)=test/compel clean $(Q) $(RM) .gitid .PHONY: clean-top -clean: clean-top clean-amdgpu_plugin +clean: clean-top clean-amdgpu_plugin clean-cuda_plugin -mrproper-top: clean-top clean-amdgpu_plugin +mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin $(Q) $(RM) $(CONFIG_HEADER) $(Q) $(RM) $(VERSION_HEADER) $(Q) $(RM) $(COMPEL_VERSION_HEADER) @@ -337,6 +352,14 @@ amdgpu_plugin: criu $(Q) $(MAKE) -C plugins/amdgpu all .PHONY: amdgpu_plugin +cuda_plugin: criu + $(Q) $(MAKE) -C plugins/cuda all +.PHONY: cuda_plugin + +crit: lib + $(Q) $(MAKE) -C crit +.PHONY: crit + # # Generating tar requires tag matched CRIU_VERSION. 
# If not found then simply use GIT's describe with @@ -402,6 +425,7 @@ help: @echo ' Targets:' @echo ' all - Build all [*] targets' @echo ' * criu - Build criu' + @echo ' * crit - Build crit' @echo ' zdtm - Build zdtm test-suite' @echo ' docs - Build documentation' @echo ' install - Install CRIU (see INSTALL.md)' @@ -418,19 +442,26 @@ help: @echo ' lint - Run code linters' @echo ' indent - Indent C code' @echo ' amdgpu_plugin - Make AMD GPU plugin' + @echo ' cuda_plugin - Make NVIDIA CUDA plugin' .PHONY: help -lint: - flake8 --version - flake8 --config=scripts/flake8.cfg test/zdtm.py - flake8 --config=scripts/flake8.cfg test/inhfd/*.py - flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py - flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py - flake8 --config=scripts/flake8.cfg lib/py/images/images.py - flake8 --config=scripts/flake8.cfg scripts/criu-ns - flake8 --config=scripts/flake8.cfg crit/setup.py - flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py - flake8 --config=scripts/flake8.cfg coredump/ +ruff: + @ruff --version + ruff check ${RUFF_FLAGS} --config=scripts/ruff.toml \ + test/zdtm.py \ + test/inhfd/*.py \ + test/others/rpc/config_file.py \ + lib/pycriu/images/pb2dict.py \ + lib/pycriu/images/images.py \ + scripts/criu-ns \ + test/others/criu-ns/run.py \ + crit/*.py \ + crit/crit/*.py \ + scripts/uninstall_module.py \ + coredump/ coredump/coredump \ + scripts/github-indent-warnings.py + +shellcheck: shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install @@ -438,18 +469,23 @@ lint: shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh + shellcheck -x test/others/action-script/*.sh + +codespell: codespell -S tags - # Do not append \n to pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' - # Do not use %m with pr_perror or fail - ! 
git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|debug|info|msg)|fail)\>.*%m' - # Do not use errno with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>\(".*".*errno' + +lint: ruff shellcheck codespell + # Do not append \n to pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' + # Do not use %m with pr_* or fail + ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|pwarn|debug|info|msg)|fail)\>.*%m' + # Do not use errno with pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>\(".*".*errno' # End pr_(err|warn|msg|info|debug) with \n ! git --no-pager grep -En '^\s*\<pr_(err|warn|msg|info|debug)\>.*);$$' | grep -v '\\n' # No EOL whitespace for C files ! git --no-pager grep -E '\s+$$' \*.c \*.h -.PHONY: lint +.PHONY: lint ruff shellcheck codespell codecov: SHELL := $(shell which bash) codecov: diff --git a/Makefile.config b/Makefile.config index 270ec61c0f..5ab689d411 100644 --- a/Makefile.config +++ b/Makefile.config @@ -2,12 +2,15 @@ include $(__nmk_dir)utils.mk include $(__nmk_dir)msg.mk include scripts/feature-tests.mak +# This is a kludge for $(info ...) to not eat spaces. +S := + ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else $(info Note: Building without setproctitle() and strlcpy() support.) - $(info $(info) To enable these features, please install libbsd-devel (RPM) / libbsd-dev (DEB).) + $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) endif ifeq ($(call pkg-config-check,libselinux),y) @@ -23,10 +26,10 @@ endif ifeq ($(call pkg-config-check,libdrm),y) export CONFIG_AMDGPU := y - $(info Note: Building criu with amdgpu_plugin.) + $(info Note: Building with amdgpu_plugin.) else - $(info Note: Building criu without amdgpu_plugin.) - $(info Note: libdrm and libdrm_amdgpu are required to build amdgpu_plugin.) + $(info Note: Building without amdgpu_plugin.)
+ $(info $S Install libdrm-devel (RPM) or libdrm-dev (DEB) to fix.) endif ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) @@ -34,7 +37,8 @@ ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) export CONFIG_GNUTLS := y FEATURE_DEFINES += -DCONFIG_GNUTLS else - $(info Note: Building without GnuTLS support) + $(info Note: Building without GnuTLS support.) + $(info $S Install gnutls-devel (RPM) or gnutls-dev (DEB) to fix.) endif ifeq ($(call pkg-config-check,libnftables),y) @@ -46,16 +50,19 @@ ifeq ($(call pkg-config-check,libnftables),y) LIBS_FEATURES += $(LIB_NFTABLES) FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_1 else - $(warning Warn: you have libnftables installed but it has incompatible API) - $(warning Warn: Building without nftables support) + $(info Warn: Building without nftables support (incompatible API version).) endif else - $(warning Warn: you have no libnftables installed) - $(warning Warn: Building without nftables support) + $(info Warn: Building without nftables support.) + $(info $S Install nftables-devel (RPM) or libnftables-dev (DEB) to fix.) endif export LIBS += $(LIBS_FEATURES) +ifneq ($(PLUGINDIR),) + FEATURE_DEFINES += -DCR_PLUGIN_DEFAULT="\"$(PLUGINDIR)\"" +endif + CONFIG_FILE = .config $(CONFIG_FILE): @@ -67,10 +74,10 @@ ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) export CONFIG_COMPAT := y FEATURE_DEFINES += -DCONFIG_COMPAT else - $(info Note: Building without ia32 C/R, missed ia32 support in gcc) - $(info $(info) That may be related to missing gcc-multilib in your) - $(info $(info) distribution or you may have Debian with buggy toolchain) - $(info $(info) (issue https://github.com/checkpoint-restore/criu/issues/315)) + $(info Note: Building without ia32 C/R, missing ia32 support in gcc.) + $(info $S It may be related to missing gcc-multilib in your) + $(info $S distribution, or you may have Debian with buggy toolchain.) + $(info $S See https://github.com/checkpoint-restore/criu/issues/315.) 
endif endif @@ -78,14 +85,15 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW MEMFD_CREATE \ + SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name define gen-feature-test ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) $(Q) echo '#define CONFIG_HAS_$(1)' >> $$@ - $(Q) echo '' >> $$@ +else + $(Q) echo '// CONFIG_HAS_$(1) is not set' >> $$@ endif endef diff --git a/Makefile.install b/Makefile.install index c798637beb..455735f3b1 100644 --- a/Makefile.install +++ b/Makefile.install @@ -29,6 +29,29 @@ LIBDIR ?= $(PREFIX)/lib export PREFIX BINDIR SBINDIR MANDIR RUNDIR export LIBDIR INCLUDEDIR LIBEXECDIR PLUGINDIR +# Detect externally managed Python environment (PEP 668). +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES ?= 0 + +# If Python environment is externally managed and PIP_BREAK_SYSTEM_PACKAGES is not set, skip pip install. 
+SKIP_PIP_INSTALL := 0 +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + +SKIP_PIP_INSTALL := 1 +$(info Warn: Externally managed python environment) +$(info Consider using PIP_BREAK_SYSTEM_PACKAGES=1) + +endif +endif + +# Default flags for pip install: +# --upgrade: Upgrade crit/pycriu packages +# --ignore-installed: Ignore existing packages and reinstall them +PIPFLAGS ?= --upgrade --ignore-installed + +export SKIP_PIP_INSTALL PIPFLAGS + install-man: $(Q) $(MAKE) -C Documentation install .PHONY: install-man @@ -37,6 +60,10 @@ install-lib: lib $(Q) $(MAKE) $(build)=lib install .PHONY: install-lib +install-crit: lib + $(Q) $(MAKE) $(build)=crit install +.PHONY: install-crit + install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu @@ -45,19 +72,25 @@ install-amdgpu_plugin: amdgpu_plugin $(Q) $(MAKE) -C plugins/amdgpu install .PHONY: install-amdgpu_plugin +install-cuda_plugin: cuda_plugin + $(Q) $(MAKE) -C plugins/cuda install +.PHONY: install-cuda_plugin + install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-criu install-compel install-amdgpu_plugin ; +install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ; .PHONY: install uninstall: $(Q) $(MAKE) -C Documentation $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ $(Q) $(MAKE) -C plugins/amdgpu $@ + $(Q) $(MAKE) -C plugins/cuda $@ .PHONY: uninstall diff --git a/README.md b/README.md index ff4aa1a239..f578e745c9 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,10 @@ Pages worth starting with are: - [Installation instructions](http://criu.org/Installation) - [A simple example of usage](http://criu.org/Simple_loop) - [Examples of more advanced 
usage](https://criu.org/Category:HOWTO) -- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) +- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/index.php?title=FAQ) ### Checkpoint and restore of simple loop process -[

](https://asciinema.org/a/232445) +

## Advanced features diff --git a/compel/Makefile b/compel/Makefile index b79aee6871..78ec4826af 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -33,7 +33,7 @@ lib-y += arch/$(ARCH)/src/lib/thread_area.o endif # handle_elf() has no support of ELF relocations on ARM (yet?) -ifneq ($(filter arm aarch64,$(ARCH)),) +ifneq ($(filter arm aarch64 loongarch64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index f8ec55d6c0..9152024fd8 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -18,7 +18,7 @@ struct aux_context { struct _aarch64_ctx end; }; -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index d0189f0039..812ba34a37 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -59,10 +59,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpsimd = ext_regs ? 
ext_regs : &tmp; struct iovec iov; int ret; diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 8bcc3cc50a..9a33009eb0 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -39,7 +39,7 @@ recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, str sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) shutdown 210 293 (int sockfd, int how) -bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +bind 200 282 (int sockfd, const struct sockaddr *addr, int addrlen) setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) @@ -118,7 +118,9 @@ fsopen 430 430 (char *fsname, unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) clone3 435 435 (struct clone_args *uargs, size_t size) +close_range 436 436 (unsigned int fd, unsigned int max_fd, unsigned int flags) pidfd_open 434 434 (pid_t pid, unsigned int flags) openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index 7700f52caf..8b810a88f5 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -65,10 +65,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr } 
#define PTRACE_GETVFPREGS 27 -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *vfp, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *vfp = ext_regs ? ext_regs : &tmp; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); diff --git a/compel/arch/loongarch64/plugins/include/asm/prologue.h b/compel/arch/loongarch64/plugins/include/asm/prologue.h new file mode 100644 index 0000000000..c19ce54d7a --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME: It should rather be taken from the sigframe header.
+ */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/asm/syscall-types.h b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000000..b883bd8bed --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h @@ -0,0 +1,30 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#include +/* Types for sigaction, sigprocmask syscalls */ +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +/* refer to arch/loongarch/include/uapi/asm/signal.h */ +#define _KNSIG 64 +#define _NSIG_BPW BITS_PER_LONG +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + uint64_t sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#define SA_RESTORER 0x04000000 + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/features.h b/compel/arch/loongarch64/plugins/include/features.h new file mode 100644 index 0000000000..b4a3cded2b --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/compel/arch/loongarch64/plugins/std/parasite-head.S b/compel/arch/loongarch64/plugins/std/parasite-head.S new file mode 100644 index 0000000000..3a960490eb --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/parasite-head.S @@ -0,0 +1,9 @@ + +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + bl parasite_service; + break 0; +END(__export_parasite_head_start) + diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000000..0d08f34e1d --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,117 @@ +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o +sys-proto-types := $(obj)/include/uapi/std/syscall-types.h +sys-proto-generic := $(obj)/include/uapi/std/syscall.h +sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h +sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h +sys-proto = $(obj)/include/uapi/std/syscall-$(1).h +sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl +sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S +sys-asm-common-name = std/syscalls/syscall-common-loongarch-$(1).S +sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c + +sys-bits := 64 + +AV := $$$$ + +define gen-rule-sys-codes +$(sys-codes): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ + sub("^__NR", "SYS", SYSN); \ + print "\n#ifndef ", $(AV)1; \ + print "#define", $(AV)1, $(AV)2; \ + print "#endif"; \ + print "\n#ifndef ", SYSN; \ + print "#define ", SYSN, $(AV)1; \ + print "#endif";}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-proto +$(sys-proto): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include ' >> $$@ +ifeq ($(1),32) + $(Q) echo '#include "asm/syscall32.h"' >> $$@ +endif + 
$(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ + substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-asm +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ + $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ +endef + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(sys-codes-generic): $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ + sub("^__NR", "__NR32", NR32); \ + print "\n#ifndef ", NR32; \ + print "#define ", NR32, $$2; \ + print "#endif";}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ +mrproper-y += $(sys-codes-generic) + +$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ +mrproper-y += $(sys-proto-generic) + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat 
$$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) +$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) +$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) +$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(call sys-codes,$(sys-bits)) +std-headers-deps += $(call sys-proto,$(sys-bits)) +std-headers-deps += $(call sys-asm,$(sys-bits)) +std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) +std-headers-deps += $(sys-codes-generic) +std-headers-deps += $(sys-proto-generic) +std-headers-deps += $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S new file mode 100644 index 0000000000..fff8944669 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S @@ -0,0 +1,44 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ +ENTRY(name); \ + addi.d $a7, $zero, opcode; \ + syscall 0; \ + jirl $r0, $r1, 0; \ +END(name) + +#ifndef AT_FDCWD +#define AT_FDCWD -100 +#endif + +#ifndef AT_REMOVEDIR +#define AT_REMOVEDIR 0x200 +#endif + +ENTRY(sys_open) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_openat +END(sys_open) + +ENTRY(sys_mkdir) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_mkdirat +END(sys_mkdir) + +ENTRY(sys_rmdir) + addi.d $a2, $zero, AT_REMOVEDIR + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_unlinkat +END(sys_rmdir) + +ENTRY(__cr_restore_rt) + addi.d $a7, $zero, __NR_rt_sigreturn + syscall 0 +END(__cr_restore_rt) diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl new file mode 100644 index 0000000000..aa6ffb44d1 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -0,0 +1,122 @@ +# +# System calls table; please make sure the table consists only of the syscalls +# really used somewhere in the project. +# NOTE(review): provenance note inherited from the MIPS table (kernel/linux-3.10.84/arch/mips/include/uapi/asm/unistd.h, where Linux 64-bit syscalls are in the range from 5000 to 5999); the codes listed below are not in that range -- confirm the intended source. +# +# __NR_name code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 32 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 39 sys_umount2 (char *name, int flags) +__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 57 sys_close (int fd) +__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 63 sys_read (int fd, void *buf, unsigned long count) +__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 69 sys_preadv_raw (int
fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 92 sys_personality (unsigned int personality) +__NR_exit 93 sys_exit (unsigned long error_code) +__NR_exit_group 94 sys_exit_group (int error_code) +__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) +__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 110 
sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 128 sys_restart_syscall (void) +__NR_kill 129 sys_kill (long pid, int sig) +__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 139 sys_rt_sigreturn (void) +__NR_setpriority 140 sys_setpriority (int which, int who, int nice) +__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 155 sys_getpgid (pid_t pid) +__NR_setfsuid 151 sys_setfsuid (int fsuid) +__NR_setfsgid 152 sys_setfsgid (int fsgid) +__NR_getsid 156 sys_getsid (void) +__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 166 sys_umask (int mask) +__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 172 sys_getpid (void) +__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 178 sys_gettid 
(void) +__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 198 sys_socket (int domain, int type, int protocol) +__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_shutdown 210 sys_shutdown (int sockfd, int how) +__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 214 sys_brk (void *addr) +__NR_munmap 215 sys_munmap (void *addr, unsigned long len) +__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) 
+__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 268 sys_setns (int fd, int nstype) +__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 282 sys_userfaultfd (int flags) +__NR_membarrier 283 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) +#__NR_rmdir ! sys_rmdir (const char *name) +#__NR_unlink ! sys_unlink (char *pathname) +#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) +#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) +#__NR_mkdir ! sys_mkdir (const char *name, int mode) +#__NR_open ! 
sys_open (const char *filename, unsigned long flags, unsigned long mode) diff --git a/compel/arch/loongarch64/scripts/compel-pack.lds.S b/compel/arch/loongarch64/scripts/compel-pack.lds.S new file mode 100644 index 0000000000..cfb7a2fb35 --- /dev/null +++ b/compel/arch/loongarch64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(loongarch) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} diff --git a/compel/arch/loongarch64/src/lib/cpu.c b/compel/arch/loongarch64/src/lib/cpu.c new file mode 100644 index 0000000000..172b90e275 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/cpu.c @@ -0,0 +1,41 @@ +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; +static bool rt_info_done = false; + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *c) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } + + return compel_test_cpu_cap(&rt_info, feature); +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf-host.c b/compel/arch/loongarch64/src/lib/handle-elf-host.c new file mode 100644 index 0000000000..a605a5a452 --- /dev/null +++ 
b/compel/arch/loongarch64/src/lib/handle-elf-host.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf.c b/compel/arch/loongarch64/src/lib/handle-elf.c new file mode 100644 index 0000000000..a605a5a452 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/handle-elf.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/include/handle-elf.h b/compel/arch/loongarch64/src/lib/include/handle-elf.h new file mode 100644 index 0000000000..b0a66ef879 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/handle-elf.h @@ -0,0 +1,8 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define arch_is_machine_supported(e_machine) (e_machine == EM_LOONGARCH) + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/syscall.h b/compel/arch/loongarch64/src/lib/include/syscall.h new 
file mode 100644 index 0000000000..ac3e2799ac --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ + +#ifndef SIGSTKFLT +#define SIGSTKFLT 16 +#endif + +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 0000000000..21eb1309f2 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000000..e568df789c --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_CPU_H__ +#define __CR_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; +#endif /* __CR_ASM_CPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000000..7f476d541a --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000000..0b047a5b08 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,67 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * From the Linux kernel header arch/loongarch/include/uapi/asm/ptrace.h 
+ * + * A thread LoongArch CPU context + * + * struct user_fp_state { + * uint64_t fpr[32]; + * uint64_t fcc; + * uint32_t fcsr; + * }; + * + * struct user_pt_regs { + * unsigned long regs[32]; + * unsigned long csr_era; + * unsigned long csr_badv; + * unsigned long reserved[11]; + * }; + */ + +struct user_gp_regs { + uint64_t regs[32]; + uint64_t orig_a0; + uint64_t pc; + uint64_t csr_badv; + uint64_t reserved[10]; +} __attribute__((aligned(8))); + +struct user_fp_regs { + uint64_t regs[32]; + uint64_t fcc; + uint32_t fcsr; +}; + +typedef struct user_gp_regs user_regs_struct_t; +typedef struct user_fp_regs user_fpregs_struct_t; + +#define user_regs_native(regs) true + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(r) ((uint64_t)(r).regs[4]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define REG_SP(r) ((uint64_t)(r).regs[3]) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[11]) +#define SET_REG_IP(r, val) ((r).pc = (val)) + +#define GPR_NUM 32 +#define FPR_NUM 32 + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000000..fcb545a1d2 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,86 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include +#include + +#include +#include + +#include + +#define rt_sigcontext sigcontext +/* sigcontext defined in usr/include/uapi/asm/sigcontext.h*/ +#include +typedef __u32 u32; + +typedef struct sigcontext_t { + __u64 pc; + __u64 regs[32]; + __u32 flags; + __u64 extcontext[0] __attribute__((__aligned__(16))); +} sigcontext_t; + +typedef struct context_info_t { 
+ __u32 magic; + __u32 size; + __u64 padding; +} context_info_t; + +#define FPU_CTX_MAGIC 0x46505501 +#define FPU_CTX_ALIGN 8 +typedef struct fpu_context_t { + __u64 regs[32]; + __u64 fcc; + __u64 fcsr; +} fpu_context_t; + +typedef struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + sigset_t uc_sigmask; + __u8 __unused[1024 / 8 - sizeof(sigset_t)]; + sigcontext_t uc_mcontext; +} ucontext; + +/* Copy from the kernel source arch/loongarch/kernel/signal.c */ +struct rt_sigframe { + rt_siginfo_t rs_info; + ucontext rs_uc; +}; + +#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe->rs_uc)) +#define RT_SIGFRAME_SIGMASK(rt_sigframe) ((k_rtsigset_t *)&RT_SIGFRAME_UC(rt_sigframe)->uc_sigmask) +#define RT_SIGFRAME_SIGCTX(rt_sigframe) (&(RT_SIGFRAME_UC(rt_sigframe)->uc_mcontext)) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(RT_SIGFRAME_SIGCTX(rt_sigframe)->pc)) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) + +#define RT_SIGFRAME_FPU(rt_sigframe) \ + ({ \ + context_info_t *ctx = (context_info_t *)RT_SIGFRAME_SIGCTX(rt_sigframe)->extcontext; \ + ctx->magic = FPU_CTX_MAGIC; \ + ctx->size = sizeof(context_info_t) + sizeof(fpu_context_t); \ + (fpu_context_t *)((char *)ctx + sizeof(context_info_t)); \ + }) + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "addi.d $sp, %0, 0 \n" \ + "addi.d $a7, $zero, "__stringify(__NR_rt_sigreturn)" \n" \ + "syscall 0" \ + : \ + :"r"(new_sp) \ + : "$a7", "memory") +/* clang-format on */ + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); + +#define rt_sigframe_erase_sigset(sigframe) memset(RT_SIGFRAME_SIGMASK(sigframe), 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(RT_SIGFRAME_SIGMASK(sigframe), from, sizeof(k_rtsigset_t)) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/loongarch64/src/lib/infect.c 
b/compel/arch/loongarch64/src/lib/infect.c new file mode 100644 index 0000000000..8e3c19aff2 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/infect.c @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include + +#include +#include +#include "errno.h" +#include +#include +#include "common/err.h" +#include "common/page.h" +#include "asm/infect-types.h" +#include "ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" +#include "common/bug.h" + +/* + * Injected syscall instruction + * loongarch64 is Little Endian + */ +const char code_syscall[] = { + 0x00, 0x00, 0x2b, 0x00, /* syscall */ + 0x00, 0x00, 0x2a, 0x00 /* break */ +}; + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigcontext_t *sc; + fpu_context_t *fpu; + + sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, regs->regs, sizeof(regs->regs)); + sc->pc = regs->pc; + + fpu = RT_SIGFRAME_FPU(sigframe); + memcpy(fpu->regs, fpregs->regs, sizeof(fpregs->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; + struct iovec iov; + int ret; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { + pr_perror("Failed to obtain CPU registers for %d", pid); + goto err; + } + + /* + * Refer to Linux kernel arch/loongarch/kernel/signal.c + */ + if (regs->regs[0]) { + switch (regs->regs[4]) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->regs[4] = regs->orig_a0; + regs->pc -= 4; + break; + case -ERESTART_RESTARTBLOCK: + regs->regs[4] = regs->orig_a0; + regs->regs[11] = __NR_restart_syscall; + regs->pc -= 4; + break; + } + regs->regs[0] = 0; /* Don't deal with this again. */ + } + + iov.iov_base = fpregs; + iov.iov_len = sizeof(user_fpregs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + goto err; + } + + ret = save(arg, regs, fpregs); +err: + return ret; /* propagate ptrace()/save() failure instead of always reporting success */ +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +/* + * Registers $4 ~ $11 represents arguments a0 ~ a7, especially a7 is + * used as syscall number. 
+ */ +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + int err; + user_regs_struct_t regs = ctl->orig.regs; + + regs.regs[11] = (unsigned long)nr; + regs.regs[4] = arg1; + regs.regs[5] = arg2; + regs.regs[6] = arg3; + regs.regs[7] = arg4; + regs.regs[8] = arg5; + regs.regs[9] = arg6; + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.regs[4]; + + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); + + if (err < 0 || IS_ERR_VALUE(map)) { + pr_err("remote mmap() failed: %s\n", strerror(-map)); + return NULL; + } + + return (void *)map; +} + +/* + * regs must be inited when calling this function from original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->regs[4] = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); + return err ? 
err : ret; +} + +/* + * TODO: add feature + */ +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +/* + * Refer to Linux kernel arch/loongarch/include/asm/processor.h + */ +#define TASK_SIZE32 (1UL << 31) +#define TASK_SIZE64_MIN (1UL << 40) +#define TASK_SIZE64_MAX (1UL << 48) + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + for (task_size = TASK_SIZE64_MIN; task_size < TASK_SIZE64_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index 505ec849d7..85faca5a92 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -115,7 +115,9 @@ __NR_fsopen 5430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 5431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 5436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 5318 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c index afa0f5ed5f..0e98aaee3f 100644 --- a/compel/arch/mips/src/lib/infect.c +++ b/compel/arch/mips/src/lib/infect.c @@ -119,10 +119,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct 
rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index af40d71045..c56b4e6de6 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -114,7 +114,9 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 365 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index 8cc94ba740..0c4ccb6486 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -14,7 +14,7 @@ */ #include -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct 
by the CRIU code #define rt_sigcontext sigcontext #include diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index db999ce37f..84c2b1d7c3 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -11,6 +11,7 @@ #include "log.h" #include "common/bug.h" #include "common/page.h" +#include "common/err.h" #include "infect.h" #include "infect-priv.h" @@ -303,33 +304,58 @@ static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs) return -1; /* still failing the checkpoint */ } -static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) -{ - pr_info("Dumping GP/FPU registers for %d\n", pid); +/* + * This is inspired by kernel function check_syscall_restart in + * arch/powerpc/kernel/signal.c + */ - /* - * This is inspired by kernel function check_syscall_restart in - * arch/powerpc/kernel/signal.c - */ #ifndef TRAP #define TRAP(r) ((r).trap & ~0xF) #endif - if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { - /* Restart the system call */ - switch (regs->gpr[3]) { - case ERESTARTNOHAND: - case ERESTARTSYS: - case ERESTARTNOINTR: - regs->gpr[3] = regs->orig_gpr3; - regs->nip -= 4; - break; - case ERESTART_RESTARTBLOCK: - pr_warn("Will restore %d with interrupted system call\n", pid); - regs->gpr[3] = EINTR; - break; - } +static bool trap_is_scv(user_regs_struct_t *regs) +{ + return TRAP(*regs) == 0x3000; +} + +static bool trap_is_syscall(user_regs_struct_t *regs) +{ + return trap_is_scv(regs) || TRAP(*regs) == 0x0C00; +} + +static void handle_syscall(pid_t pid, user_regs_struct_t *regs) +{ + unsigned long ret = regs->gpr[3]; + + if (trap_is_scv(regs)) { + if (!IS_ERR_VALUE(ret)) + return; + ret = -ret; + } else if (!(regs->ccr & 0x10000000)) { + return; + } + + /* Restart or interrupt the system call */ + switch (ret) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case ERESTARTNOINTR: + regs->gpr[3] = regs->orig_gpr3; + regs->nip -= 4; + break; + case 
ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->gpr[3] = trap_is_scv(regs) ? -EINTR : EINTR; + break; } +} + +static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (trap_is_syscall(regs)) + handle_syscall(pid, regs); /* Resetting trap since we are now coming from user space. */ regs->trap = 0; @@ -365,10 +391,9 @@ static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_stru return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpregs = ext_regs ? ext_regs : &tmp; int ret; ret = __get_task_regs(pid, regs, fpregs); diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index 6a349e1cb7..018d58a590 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -114,7 +114,9 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) 
+__NR_membarrier 356 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 3cd25e71d8..85dfc3a4d4 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -293,10 +293,9 @@ static int s390_disable_ri_bit(pid_t pid, user_regs_struct_t *regs) /* * Prepare task registers for restart */ -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpregs = ext_regs ? ext_regs : &tmp; struct iovec iov; int rewind; diff --git a/compel/arch/x86/plugins/std/parasite-head.S b/compel/arch/x86/plugins/std/parasite-head.S index 4fb38d1f14..42cad4808c 100644 --- a/compel/arch/x86/plugins/std/parasite-head.S +++ b/compel/arch/x86/plugins/std/parasite-head.S @@ -34,7 +34,21 @@ END(__export_parasite_head_start_compat) .code64 #endif +/* + * When parasite_service() runs in the daemon mode it will return the stack + * pointer for the sigreturn frame in %rax and we call sigreturn directly + * from here. + * Since a valid stack pointer is positive, it is safe to presume that + * return value <= 0 means that parasite_service() called parasite_trap_cmd() + * in non-daemon mode, and the parasite should stop at int3. 
+ */ ENTRY(__export_parasite_head_start) call parasite_service + cmp $0, %rax + jle 1f + movq %rax, %rsp + movq $15, %rax + syscall +1: int $0x03 END(__export_parasite_head_start) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index a119a59b2e..cc23dc3f35 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -102,7 +102,9 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 375 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 16dd86e791..7fbfd69ad1 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -113,7 +113,10 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) 
__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_map_shadow_stack 453 sys_map_shadow_stack (unsigned long addr, unsigned long size, unsigned int flags) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h index 63ff83dbeb..11c50e0e56 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h @@ -244,6 +244,7 @@ enum cpuid_leafs { #define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (11 * 32 + 7) /* Shadow Stack */ #define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (11 * 32 + 9) /* Vector AES */ #define X86_FEATURE_VPCLMULQDQ (11 * 32 + 10) /* Carry-Less Multiplication Double Quadword */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index bd3b0cbd5c..d595a68fce 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -21,7 +21,28 @@ #define XSTATE_YMM 0x4 #define FXSAVE_SIZE 512 -#define XSAVE_SIZE 4096 +/* + * This used to be 4096 (one page). There is a comment below concerning + * this size: + * "One page should be enough for the whole xsave state ;-)" + * Which is kind of funny as it is no longer enough ;-) + * + * Older CPUs: + * # cpuid -1 -l 0xd -s 0 + * ... 
+ * bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) + * + * Newer CPUs (Sapphire Rapids): + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) + * + * So one page is no longer enough... But: + * + * Four pages should be enough for the whole xsave state ;-) + */ + +#define XSAVE_SIZE (4 * 4096) #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE @@ -224,6 +245,14 @@ struct pkru_state { uint32_t pad; } __packed; +/* + * State component 11 is Control-flow Enforcement user states + */ +struct cet_user_state { + uint64_t cet; /* user control-flow settings */ + uint64_t ssp; /* user shadow stack pointer */ +}; + /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. @@ -235,8 +264,11 @@ struct pkru_state { * * * One page should be enough for the whole xsave state ;-) + * + * Of course it was not ;-) Now using four pages... + * */ -#define EXTENDED_STATE_AREA_SIZE (4096 - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) +#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct) - sizeof(struct cet_user_state)) /* * cpu requires it to be 64 byte aligned @@ -252,6 +284,7 @@ struct xsave_struct { struct ymmh_struct ymmh; uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; + struct cet_user_state cet; } __aligned(FP_MIN_ALIGN_BYTES) __packed; struct xsave_struct_ia32 { diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h index b35504ff88..b998c488c7 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -143,4 +143,11 @@ typedef struct xsave_struct user_fpregs_struct_t; */ #define __NR32_mmap __NR32_mmap2 +extern bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs); +#define compel_shstk_enabled __compel_shstk_enabled + 
+extern int __parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs); +#define parasite_setup_shstk __parasite_setup_shstk + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h index ec8c156fa4..4a2e675597 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -177,6 +177,24 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) #define USER32_CS 0x23 /* clang-format off */ +/* + * rst_sigreturn in restorer is a noninline call which adds an entry to the + * shadow stack above the sigframe token; + * if shadow stack is enabled, increment the shadow stack pointer to remove + * that entry + */ +#define ARCH_SHSTK_POP() \ + asm volatile( \ + "xor %%rax, %%rax\n" \ + "rdsspq %%rax\n" \ + "cmpq $0, %%rax\n" \ + "jz 1f\n" \ + "movq $1, %%rax\n" \ + "incsspq %%rax\n" \ + "1:\n" \ + : : \ + : "rax") + #define ARCH_RT_SIGRETURN_NATIVE(new_sp) \ asm volatile( \ "movq %0, %%rax \n" \ @@ -203,10 +221,19 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) : "rdi"(new_sp) \ : "eax", "r8", "r9", "r10", "r11", "memory") -#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ +#define ARCH_RT_SIGRETURN_RST(new_sp, rt_sigframe) \ do { \ - if ((rt_sigframe)->is_native) \ + if ((rt_sigframe)->is_native) { \ + ARCH_SHSTK_POP(); \ ARCH_RT_SIGRETURN_NATIVE(new_sp); \ + } else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ +} while (0) + +#define ARCH_RT_SIGRETURN_DUMP(new_sp, rt_sigframe) \ +do { \ + if ((rt_sigframe)->is_native) \ + return new_sp; \ else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ } while (0) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 01959b95b2..a07b1c9f37 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -26,6 +26,16 @@ #ifndef NT_X86_XSTATE #define NT_X86_XSTATE 
0x202 /* x86 extended state using xsave */ #endif + +#ifndef NT_X86_SHSTK +#define NT_X86_SHSTK 0x204 /* x86 shstk state */ +#endif + +#ifndef ARCH_SHSTK_STATUS +#define ARCH_SHSTK_STATUS 0x5005 +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#endif + #ifndef NT_PRSTATUS #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ #endif @@ -220,6 +230,16 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr #define get_signed_user_reg(pregs, name) \ ((user_regs_native(pregs)) ? (int64_t)((pregs)->native.name) : (int32_t)((pregs)->compat.name)) +static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) +{ + if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { + pr_perror("Can't obtain FPU registers for %d", pid); + return -1; + } + + return 0; +} + static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) { struct iovec iov; @@ -232,14 +252,57 @@ static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) return -1; } - return 0; -} + if ((xsave->xsave_hdr.xstate_bv & 3) != 3) { + // Due to init-optimisation [1] x87 FPU or SSE state may not be filled in. + // Since those are restored unconditionally, make sure the init values are + // filled by retrying with old PTRACE_GETFPREGS. 
+ // + // [1] Intel® 64 and IA-32 Architectures Software Developer's + // Manual Volume 1: Basic Architecture + // Section 13.6: Processor tracking of XSAVE-managed state + if (get_task_fpregs(pid, xsave)) + return -1; + } -static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) -{ - if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { - pr_perror("Can't obtain FPU registers for %d", pid); - return -1; + /* + * xsave may be on stack, if we don't clear it explicitly we get + * funky shadow stack state + */ + memset(&xsave->cet, 0, sizeof(xsave->cet)); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + unsigned long ssp = 0; + unsigned long features = 0; + + if (ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long)&features, ARCH_SHSTK_STATUS)) { + /* + * kernels that don't support shadow stack return + * -EINVAL + */ + if (errno == EINVAL) + return 0; + + pr_perror("shstk: can't get shadow stack status for %d", pid); + return -1; + } + + if (!(features & ARCH_SHSTK_SHSTK)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: can't get SSP for %d", pid); + return -1; + } + } + + xsave->cet.cet = features; + xsave->cet.ssp = ssp; + + pr_debug("%d: shstk: cet: %lx ssp: %lx\n", pid, xsave->cet.cet, xsave->cet.ssp); } return 0; @@ -334,10 +397,9 @@ static int corrupt_extregs(pid_t pid) return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, void *arg, unsigned long flags) { - user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping general registers for %d in %s mode\n", pid, user_regs_native(regs) ? 
"native" : "compat"); @@ -687,3 +749,59 @@ unsigned long compel_task_size(void) { return TASK_SIZE; } + +bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return false; + + if (ext_regs->cet.cet & ARCH_SHSTK_SHSTK) + return true; + + return false; +} + +int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) +{ + pid_t pid = ctl->rpid; + unsigned long sa_restorer = ctl->parasite_ip; + unsigned long long ssp; + unsigned long token; + struct iovec iov; + + if (!compel_shstk_enabled(ext_regs)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: %d: cannot get SSP", pid); + return -1; + } + } + + /* The token is for 64-bit */ + token = ALIGN_DOWN(ssp, 8); + token |= (1UL << 63); + ssp = ALIGN_DOWN(ssp, 8) - 8; + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, token)) { + pr_perror("shstk: %d: failed to inject shadow stack token", pid); + return -1; + } + + ssp = ssp - sizeof(uint64_t); + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, sa_restorer)) { + pr_perror("shstk: %d: failed to inject restorer address", pid); + return -1; + } + + ssp = ssp + sizeof(uint64_t); + if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + pr_perror("shstk: %d: cannot write SSP", pid); + return -1; + } + + return 0; +} diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 3bd36dda15..7e6134f4bc 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -120,6 +120,7 @@ struct infect_ctx { open_proc_fn open_proc; int log_fd; /* fd for parasite code to send messages to */ + unsigned long remote_map_addr; /* User-specified address where to mmap parasitic code, default not set */ }; extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); @@ 
-182,4 +183,21 @@ void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); +#ifndef compel_shstk_enabled +static inline bool compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + return false; +} +#define compel_shstk_enabled +#endif + +#ifndef parasite_setup_shstk +static inline int parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs) +{ + return 0; +} +#define parasite_setup_shstk parasite_setup_shstk +#endif + #endif diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h index 63dfee97fd..558124fbd6 100644 --- a/compel/include/uapi/ptrace.h +++ b/compel/include/uapi/ptrace.h @@ -86,6 +86,19 @@ struct __ptrace_rseq_configuration { #define PTRACE_EVENT_STOP 128 #endif +/* + * Amazon Linux 2 uses glibc 2.26. PTRACE_ARCH_PRCTL was added in glibc 2.27. + * This allows CRIU to build on Amazon Linux 2. + * + * Note that in sys/ptrace.h, PTRACE_ARCH_PRCTL is an enum value so the + * preprocessor doesn't know about it. PT_ARCH_PRCTL is the preprocessor symbol + * that matches the value of PTRACE_ARCH_PRCTL. So look for PT_ARCH_PRCTL to + * decide if PTRACE_ARCH_PRCTL is available or not. + */ +#if defined(__x86_64__) && !defined(PT_ARCH_PRCTL) +#define PTRACE_ARCH_PRCTL 30 /* From asm/ptrace-abi.h. 
*/ +#endif + extern int ptrace_suspend_seccomp(pid_t pid); extern int __must_check ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); diff --git a/compel/plugins/include/uapi/std/infect.h b/compel/plugins/include/uapi/std/infect.h index 08a5a7a804..a729abbd2b 100644 --- a/compel/plugins/include/uapi/std/infect.h +++ b/compel/plugins/include/uapi/std/infect.h @@ -7,7 +7,7 @@ extern int parasite_get_rpc_sock(void); extern unsigned int __export_parasite_service_cmd; extern void *__export_parasite_service_args_ptr; -extern int __must_check parasite_service(void); +extern unsigned long __must_check parasite_service(void); /* * Must be supplied by user plugins. diff --git a/compel/plugins/std/infect.c b/compel/plugins/std/infect.c index abecc140f1..034201320f 100644 --- a/compel/plugins/std/infect.c +++ b/compel/plugins/std/infect.c @@ -16,6 +16,10 @@ #include "rpc-pie-priv.h" +#ifndef ARCH_RT_SIGRETURN_DUMP +#define ARCH_RT_SIGRETURN_DUMP ARCH_RT_SIGRETURN +#endif + static int tsock = -1; static struct rt_sigframe *sigframe; @@ -27,7 +31,7 @@ static struct rt_sigframe *sigframe; */ static unsigned __page_size; -unsigned __attribute((weak)) page_size(void) +unsigned long __attribute((weak)) page_size(void) { return __page_size; } @@ -79,12 +83,13 @@ static int __parasite_daemon_wait_msg(struct ctl_msg *m) /* Core infect code */ -static noinline void fini_sigreturn(unsigned long new_sp) +static noinline unsigned long fini_sigreturn(unsigned long new_sp) { - ARCH_RT_SIGRETURN(new_sp, sigframe); + ARCH_RT_SIGRETURN_DUMP(new_sp, sigframe); + return new_sp; } -static int fini(void) +static unsigned long fini(void) { unsigned long new_sp; @@ -96,14 +101,14 @@ static int fini(void) sys_close(tsock); std_log_set_fd(-1); - fini_sigreturn(new_sp); + return fini_sigreturn(new_sp); BUG(); return -1; } -static noinline __used int noinline parasite_daemon(void *args) +static noinline __used unsigned long parasite_daemon(void *args) { struct ctl_msg m; int ret = -1; @@ 
-140,12 +145,10 @@ static noinline __used int noinline parasite_daemon(void *args) } out: - fini(); - - return 0; + return fini(); } -static noinline __used int parasite_init_daemon(void *data) +static noinline __used unsigned long parasite_init_daemon(void *data) { struct parasite_init_args *args = data; int ret; @@ -178,14 +181,11 @@ static noinline __used int parasite_init_daemon(void *data) } else goto err; - parasite_daemon(data); + return parasite_daemon(data); err: futex_set_and_wake(&args->daemon_connected, ret); - fini(); - BUG(); - - return -1; + return fini(); } #ifndef __parasite_entry @@ -203,7 +203,7 @@ static noinline __used int parasite_init_daemon(void *data) unsigned int __export_parasite_service_cmd = 0; void *__export_parasite_service_args_ptr = NULL; -int __used __parasite_entry parasite_service(void) +unsigned long __used __parasite_entry parasite_service(void) { unsigned int cmd = __export_parasite_service_cmd; void *args = __export_parasite_service_args_ptr; diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 5aab7aa3ee..1e3ffb9670 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -589,7 +589,7 @@ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, user_regs_struct_t } if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); goto err; } @@ -739,6 +739,7 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; struct infect_ctx *ictx = &ctl->ictx; + user_fpregs_struct_t ext_regs; /* * Get task registers before going daemon, since the @@ -746,7 +747,7 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) * while in daemon it is not such. 
*/ - if (compel_get_task_regs(pid, &ctl->orig.regs, NULL, ictx->save_regs, ictx->regs_arg, ictx->flags)) { + if (compel_get_task_regs(pid, &ctl->orig.regs, &ext_regs, ictx->save_regs, ictx->regs_arg, ictx->flags)) { pr_err("Can't obtain regs for thread %d\n", pid); return -1; } @@ -759,6 +760,9 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask)) return -1; + if (parasite_setup_shstk(ctl, &ext_regs)) + return -1; + if (parasite_init_daemon(ctl)) return -1; @@ -812,7 +816,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; pid_t pid = ctl->rpid; long sret = -ENOSYS; - int ret, fd, lfd; + int ret, fd, lfd, remote_flags; if (ctl->ictx.flags & INFECT_NO_MEMFD) return 1; @@ -856,7 +860,11 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, goto err_cure; } - ctl->remote_map = remote_mmap(ctl, NULL, size, remote_prot, MAP_FILE | MAP_SHARED, fd, 0); + remote_flags = MAP_FILE | MAP_SHARED; + if (ctl->ictx.remote_map_addr){ + remote_flags |= MAP_FIXED_NOREPLACE; + } + ctl->remote_map = remote_mmap(ctl, (void *)ctl->ictx.remote_map_addr, size, remote_prot, remote_flags, fd, 0); if (!ctl->remote_map) { pr_err("Can't rmap memfd for parasite blob\n"); goto err_curef; @@ -1398,7 +1406,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pr_debug("Daemon %d exited trapping\n", pid); if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); return -1; } @@ -1577,7 +1585,7 @@ int compel_stop_pie(pid_t pid, void *addr, bool no_bp) int ret; if (no_bp) { - pr_debug("Force no-breakpoints restore\n"); + pr_debug("Force no-breakpoints restore of %d\n", pid); ret = 0; } else ret = ptrace_set_breakpoint(pid, addr); diff --git a/compel/src/main.c b/compel/src/main.c index 
ef05a46d01..bc16c0ab41 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -57,6 +57,9 @@ static const flags_t flags = { #elif defined CONFIG_MIPS .arch = "mips", .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_LOONGARCH64 + .arch = "loongarch64", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif diff --git a/compel/test/fdspy/spy.c b/compel/test/fdspy/spy.c index 7f20ea2a7f..41de99e200 100644 --- a/compel/test/fdspy/spy.c +++ b/compel/test/fdspy/spy.c @@ -110,11 +110,11 @@ static int check_pipe_ends(int wfd, int rfd) printf("Check pipe ends are connected\n"); if (write(wfd, "1", 2) != 2) { fprintf(stderr, "write to pipe failed\n"); - return -1; + return 0; } if (read(rfd, aux, sizeof(aux)) != sizeof(aux)) { fprintf(stderr, "read from pipe failed\n"); - return -1; + return 0; } if (aux[0] != '1' || aux[1] != '\0') { fprintf(stderr, "Pipe connectivity lost\n"); diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst index c2d1509fa1..ce45f1b7cf 100644 --- a/contrib/debian/dev-packages.lst +++ b/contrib/debian/dev-packages.lst @@ -17,4 +17,3 @@ libcap-dev libaio-dev python3-yaml libnl-route-3-dev -python-future diff --git a/coredump/coredump.py b/coredump/coredump old mode 100644 new mode 100755 similarity index 88% rename from coredump/coredump.py rename to coredump/coredump index 88a1b374c6..3fbdafe81c --- a/coredump/coredump.py +++ b/coredump/coredump @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 +import platform import argparse import os import sys @@ -35,6 +37,10 @@ def main(): opts = vars(parser.parse_args()) + if platform.machine() != 'x86_64': + print('ERROR: %s only supported on x86_64' % sys.argv[0]) + sys.exit(1) + try: coredump(opts) except SystemExit as error: diff --git a/coredump/coredump-python2 b/coredump/coredump-python2 deleted file mode 100755 index 564c05ce9f..0000000000 --- a/coredump/coredump-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -import coredump - 
-if __name__ == '__main__': - coredump.main() diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 8ee4026768..20ec8e5dc8 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -35,12 +35,6 @@ from pycriu import images from . import elf - -try: - from itertools import ifilter as filter -except ImportError: - pass - # Some memory-related constants PAGESIZE = 4096 status = { @@ -59,6 +53,7 @@ "VMA_AREA_SOCKET": 1 << 11, "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, + "VMA_AREA_MEMFD": 1 << 14, "VMA_AREA_UNSUPP": 1 << 31 } @@ -318,10 +313,7 @@ def _gen_prpsinfo(self, pid): # prpsinfo.pr_psargs has a limit of 80 characters which means it will # fail here if the cmdline is longer than 80 prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] - if (sys.version_info > (3, 0)): - prpsinfo.pr_fname = core["tc"]["comm"].encode() - else: - prpsinfo.pr_fname = core["tc"]["comm"] + prpsinfo.pr_fname = core["tc"]["comm"].encode() nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -581,10 +573,7 @@ class elf_files(ctypes.Structure): setattr(data, "start" + str(i), info.start) setattr(data, "end" + str(i), info.end) setattr(data, "file_ofs" + str(i), info.file_ofs) - if (sys.version_info > (3, 0)): - setattr(data, "name" + str(i), info.name.encode()) - else: - setattr(data, "name" + str(i), info.name) + setattr(data, "name" + str(i), info.name.encode()) nhdr = elf.Elf64_Nhdr() diff --git a/coredump/pycriu b/coredump/pycriu index d13a8790a9..d1b6ed5c45 120000 --- a/coredump/pycriu +++ b/coredump/pycriu @@ -1 +1 @@ -../lib/py/ \ No newline at end of file +../lib/pycriu \ No newline at end of file diff --git a/crit/.gitignore b/crit/.gitignore index 810661179d..10c8ab1869 100644 --- a/crit/.gitignore +++ b/crit/.gitignore @@ -1,2 +1,4 @@ crit.egg-info/ build/ +dist/ +version.py diff --git a/crit/Makefile b/crit/Makefile index 988b481b63..33bd68eedc 100644 --- a/crit/Makefile +++ b/crit/Makefile @@ -1,13 +1,25 @@ 
+VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) -all-y += crit +all-y += ${VERSION_FILE} +cleanup-y += ${VERSION_FILE} -crit/crit: crit/crit-$(PYTHON) - $(Q) cp $^ $@ -crit: crit/crit -.PHONY: crit +${VERSION_FILE}: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ -clean-crit: - $(Q) $(RM) crit/crit -.PHONY: clean-crit -clean: clean-crit -mrproper: clean +install: ${VERSION_FILE} +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./crit +else + $(E) " SKIP INSTALL crit" +endif +.PHONY: install + +uninstall: +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +else + $(E) " SKIP UNINSTALL crit" +endif +.PHONY: uninstall diff --git a/crit/crit-python2 b/crit/crit-python2 deleted file mode 100755 index b0b7d3c3a0..0000000000 --- a/crit/crit-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -from pycriu import cli - -if __name__ == '__main__': - cli.main() diff --git a/crit/crit-python3 b/crit/crit-python3 deleted file mode 100755 index 80467cba72..0000000000 --- a/crit/crit-python3 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -from pycriu import cli - -if __name__ == '__main__': - cli.main() diff --git a/crit/crit/__init__.py b/crit/crit/__init__.py new file mode 100644 index 0000000000..58f3ace6c0 --- /dev/null +++ b/crit/crit/__init__.py @@ -0,0 +1 @@ +from .version import __version__ diff --git a/lib/py/cli.py b/crit/crit/__main__.py similarity index 94% rename from lib/py/cli.py rename to crit/crit/__main__.py index 5419384c3d..bce5234456 100755 --- a/lib/py/cli.py +++ b/crit/crit/__main__.py @@ -1,18 +1,17 @@ -from __future__ import print_function +#!/usr/bin/env python3 import argparse import sys import json import os import pycriu +from . 
import __version__ def inf(opts): if opts['in']: return open(opts['in'], 'rb') else: - if (sys.version_info < (3, 0)): - return sys.stdin if sys.stdin.isatty(): # If we are reading from a terminal (not a pipe) we want text input and not binary return sys.stdin @@ -28,8 +27,6 @@ def outf(opts, decode): mode = 'w+' return open(opts['out'], mode) else: - if (sys.version_info < (3, 0)): - return sys.stdout if decode: return sys.stdout return sys.stdout.buffer @@ -45,9 +42,9 @@ def decode(opts): try: img = pycriu.images.load(inf(opts), opts['pretty'], opts['nopl']) except pycriu.images.MagicException as exc: - print("Unknown magic %#x.\n"\ - "Maybe you are feeding me an image with "\ - "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) + print("Unknown magic %#x.\n" + "Maybe you are feeding me an image with " + "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) sys.exit(1) if opts['pretty']: @@ -63,9 +60,9 @@ def encode(opts): try: img = json.load(inf(opts)) except UnicodeDecodeError: - print("Cannot read JSON.\n"\ - "Maybe you are feeding me an image with protobuf data? "\ - "Encode expects JSON input.", file=sys.stderr) + print("Cannot read JSON.\n" + "Maybe you are feeding me an image with protobuf data? 
" + "Encode expects JSON input.", file=sys.stderr) sys.exit(1) pycriu.images.dump(img, outf(opts, False)) @@ -135,7 +132,7 @@ def ftype_find_in_files(opts, ft, fid): if files_img is None: try: files_img = pycriu.images.load(dinf(opts, "files.img"))['entries'] - except: + except Exception: files_img = [] if len(files_img) == 0: @@ -326,12 +323,12 @@ def explore_rss(opts): pvmi = -1 for pm in pms[1:]: pstr = '\t%lx / %-8d' % (pm['vaddr'], pm['nr_pages']) - while vmas[vmi]['end'] <= pm['vaddr']: + while vmi < len(vmas) and vmas[vmi]['end'] <= pm['vaddr']: vmi += 1 pme = pm['vaddr'] + (pm['nr_pages'] << 12) vstr = '' - while vmas[vmi]['start'] < pme: + while vmi < len(vmas) and vmas[vmi]['start'] < pme: vma = vmas[vmi] if vmi == pvmi: vstr += ' ~' @@ -368,6 +365,7 @@ def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--version', action='version', version=__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') @@ -377,8 +375,7 @@ def main(): 'decode', help='convert criu image from binary type to json') decode_parser.add_argument( '--pretty', - help= - 'Multiline with indents and some numerical fields in field-specific format', + help='Multiline with indents and some numerical fields in field-specific format', action='store_true') decode_parser.add_argument( '-i', diff --git a/crit/pycriu b/crit/pycriu deleted file mode 120000 index d13a8790a9..0000000000 --- a/crit/pycriu +++ /dev/null @@ -1 +0,0 @@ -../lib/py/ \ No newline at end of file diff --git a/crit/pyproject.toml b/crit/pyproject.toml index b1e1a4650a..9089f0a394 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -1,2 +1,22 @@ [build-system] requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "crit" +description = "CRiu Image Tool" +authors = [ + {name = "CRIU team", email = "criu@openvz.org"}, +] +license = {text 
= "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[project.scripts] +crit = "crit.__main__:main" + +[tool.setuptools] +packages = ["crit"] + +[tool.setuptools.dynamic] +version = {attr = "crit.__version__"} diff --git a/crit/setup.cfg b/crit/setup.cfg new file mode 100644 index 0000000000..fbc9a51439 --- /dev/null +++ b/crit/setup.cfg @@ -0,0 +1,20 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = crit +description = CRiu Image Tool +author = CRIU team +author_email = criu@openvz.org +license = GPLv2 +version = attr: crit.__version__ + +[options] +packages = crit +python_requires = >=3.6 + +[options.entry_points] +console_scripts = + crit = crit.__main__:main diff --git a/crit/setup.py b/crit/setup.py index 1aaa73a130..618ac1de48 100644 --- a/crit/setup.py +++ b/crit/setup.py @@ -1,29 +1,6 @@ -import os -from setuptools import setup, find_packages +#!/usr/bin/env python3 +import setuptools -def get_version(): - version = '0.0.1' - env = os.environ - if 'CRIU_VERSION_MAJOR' in env and 'CRIU_VERSION_MINOR' in env: - version = '{}.{}'.format( - env['CRIU_VERSION_MAJOR'], - env['CRIU_VERSION_MINOR'] - ) - if 'CRIU_VERSION_SUBLEVEL' in env and env['CRIU_VERSION_SUBLEVEL']: - version += '.' 
+ env['CRIU_VERSION_SUBLEVEL'] - return version - - -setup( - name='crit', - version=get_version(), - description='CRiu Image Tool', - author='CRIU team', - author_email='criu@openvz.org', - license='GPLv2', - url='https://github.com/checkpoint-restore/criu', - packages=find_packages('.'), - scripts=['crit'], - install_requires=[], -) +if __name__ == '__main__': + setuptools.setup() diff --git a/criu/Makefile b/criu/Makefile index 55bdb1b7a3..bafdd980bb 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -85,7 +85,7 @@ $(obj)/%: pie $(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) $(GMONLDOPT) -rdynamic -o $@ UNIT-BUILTINS += $(obj)/util.o UNIT-BUILTINS += $(obj)/config.o @@ -102,7 +102,7 @@ $(obj)/unittest/built-in.o: .FORCE $(obj)/unittest/unittest: $(UNIT-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) -rdynamic -o $@ unittest: $(obj)/unittest/unittest $(Q) $(obj)/unittest/$@ @@ -145,10 +145,8 @@ install: $(obj)/criu $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(Q) install -m 755 scripts/systemd-autofs-restart.sh $(DESTDIR)$(LIBEXECDIR)/criu/scripts -ifeq ($(PYTHON),python3) $(E) " INSTALL " scripts/criu-ns $(Q) install -m 755 scripts/criu-ns $(DESTDIR)$(SBINDIR) -endif .PHONY: install uninstall: diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index f586449172..ba6132d2f7 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -92,6 +92,8 @@ obj-y += servicefd.o obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o +obj-y += timer.o +obj-y += sigact.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 @@ -99,6 +101,7 @@ 
obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o obj-y += hugetlb.o +obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 13c346f449..7f6113c8f1 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,7 +6,6 @@ REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel -REQ-RPM-PKG-NAMES += $(PYTHON)-future REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -15,20 +14,16 @@ REQ-DEB-PKG-NAMES += libprotobuf-c-dev REQ-DEB-PKG-NAMES += protobuf-c-compiler REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf -REQ-DEB-PKG-NAMES += $(PYTHON)-future REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev -ifeq ($(PYTHON),python3) REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML -else -REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml -endif + export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet diff --git a/criu/action-scripts.c b/criu/action-scripts.c index 1ce6d9c108..6f79001864 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -18,6 +18,7 @@ #include "common/scm.h" static const char *action_names[ACT_MAX] = { + [ACT_PRE_STREAM] = "pre-stream", [ACT_PRE_DUMP] = "pre-dump", [ACT_POST_DUMP] = "post-dump", [ACT_PRE_RESTORE] = "pre-restore", @@ -30,6 +31,7 @@ static const char *action_names[ACT_MAX] = { [ACT_POST_RESUME] = "post-resume", [ACT_ORPHAN_PTS_MASTER] = "orphan-pts-master", [ACT_STATUS_READY] = "status-ready", + [ACT_QUERY_EXT_FILES] = "query-ext-files", }; struct script { @@ -51,6 +53,9 @@ static int run_shell_scripts(const char *action) #define ENV_IMGDIR 0x1 #define ENV_ROOTPID 0x2 + if (list_empty(&scripts)) + return 0; + if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { 
pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); return -1; @@ -111,6 +116,20 @@ int rpc_send_fd(enum script_actions act, int fd) return send_criu_rpc_script(act, (char *)action, rpc_sk, fd); } +int rpc_query_external_files(void) +{ + int rpc_sk; + + if (scripts_mode != SCRIPTS_RPC) + return 0; + + rpc_sk = get_service_fd(RPC_SK_OFF); + if (rpc_sk < 0) + return -1; + + return exec_rpc_query_external_files((char *)action_names[ACT_QUERY_EXT_FILES], rpc_sk); +} + int run_scripts(enum script_actions act) { int ret = 0; @@ -118,23 +137,24 @@ int run_scripts(enum script_actions act) pr_debug("Running %s scripts\n", action); - if (scripts_mode == SCRIPTS_NONE) + switch (scripts_mode) { + case SCRIPTS_NONE: return 0; - - if (scripts_mode == SCRIPTS_RPC) { + case SCRIPTS_RPC: ret = rpc_send_fd(act, -1); - goto out; - } - - if (scripts_mode == SCRIPTS_SHELL) { + if (ret) + break; + /* Enable scripts from config file in RPC mode (fallthrough) */ + case SCRIPTS_SHELL: ret = run_shell_scripts(action); - goto out; + break; + default: + BUG(); } - BUG(); -out: if (ret) pr_err("One of more action scripts failed\n"); + return ret; } @@ -142,8 +162,9 @@ int add_script(char *path) { struct script *script; - BUG_ON(scripts_mode == SCRIPTS_RPC); - scripts_mode = SCRIPTS_SHELL; + /* Set shell mode when a script is added but don't overwrite RPC mode */ + if (scripts_mode == SCRIPTS_NONE) + scripts_mode = SCRIPTS_SHELL; script = xmalloc(sizeof(struct script)); if (script == NULL) @@ -169,7 +190,6 @@ int add_rpc_notify(int sk) return -1; } - BUG_ON(scripts_mode == SCRIPTS_SHELL); scripts_mode = SCRIPTS_RPC; if (install_service_fd(RPC_SK_OFF, fd) < 0) diff --git a/criu/apparmor.c b/criu/apparmor.c index 9de54ce40b..48b639216a 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -207,8 +207,6 @@ static int by_time(const struct dirent **de1, const struct dirent **de2) } else { if (sb1.st_mtim.tv_sec < sb2.st_mtim.tv_sec) return -1; - if (sb1.st_mtim.tv_sec == sb2.st_mtim.tv_sec) - 
return 0; return 1; } } @@ -471,6 +469,7 @@ static void *get_suspend_policy(char *name, off_t *len) ret = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (ret == MAP_FAILED) { pr_perror("mmap of %s failed", file); + ret = NULL; goto out; } @@ -551,8 +550,8 @@ static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrit goto fail; } - ret = snprintf(path + offset + my_offset, sizeof(path) - offset - my_offset, "/.replace"); - if (ret < 0 || ret >= sizeof(path) - offset - my_offset) { + ret = snprintf(path + offset + my_offset, PATH_MAX - offset - my_offset, "/.replace"); + if (ret < 0 || ret >= PATH_MAX - offset - my_offset) { pr_err("snprintf failed\n"); goto fail; } diff --git a/criu/arch/loongarch64/Makefile b/criu/arch/loongarch64/Makefile new file mode 100644 index 0000000000..4bd99eb7eb --- /dev/null +++ b/criu/arch/loongarch64/Makefile @@ -0,0 +1,14 @@ +builtin-name := crtools.built-in.o + +ccflags-y += -iquote $(obj)/include +ccflags-y += -iquote criu/include -iquote include +ccflags-y += $(COMPEL_UAPI_INCLUDES) + +asflags-y += -Wstrict-prototypes +asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer +asflags-y += -iquote $(obj)/include +ldflags-y += -r -z noexecstack + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o diff --git a/criu/arch/loongarch64/cpu.c b/criu/arch/loongarch64/cpu.c new file mode 100644 index 0000000000..5559c4288f --- /dev/null +++ b/criu/arch/loongarch64/cpu.c @@ -0,0 +1,31 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpuinfo_dump(void) +{ + if (cpu_init()) + return -1; + if (cpu_dump_cpuinfo()) + return -1; + return 0; +} + +int cpuinfo_check(void) +{ + return 0; +} diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c new file mode 100644 index 0000000000..eeb0731ca6 --- /dev/null +++ 
b/criu/arch/loongarch64/crtools.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "log.h" +#include "asm/restorer.h" +#include "asm/parasite-syscall.h" +#include +#include "asm/dump.h" +#include "cr_options.h" +#include "common/compiler.h" +#include "restorer.h" +#include "parasite-syscall.h" +#include "util.h" +#include "cpu.h" +#include +#include "kerndat.h" + +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" + +#define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + int i; + CoreEntry *core = x; + UserLoongarch64GpregsEntry *gprs = core->ti_loongarch64->gpregs; + UserLoongarch64FpregsEntry *fprs = core->ti_loongarch64->fpregs; + for (i = 0; i < GPR_NUM; i++) + assign_reg(gprs, regs, regs[i]); + assign_reg(gprs, regs, pc); + + for (i = 0; i < FPR_NUM; i++) + assign_reg(fpregs, fpregs, regs[i]); + assign_reg(fprs, fpregs, fcc); + assign_reg(fprs, fpregs, fcsr); + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoLoongarch64 *ti_loongarch64; + UserLoongarch64GpregsEntry *gpregs; + UserLoongarch64FpregsEntry *fpregs; + + ti_loongarch64 = xmalloc(sizeof(*ti_loongarch64)); + thread_info_loongarch64__init(ti_loongarch64); + core->ti_loongarch64 = ti_loongarch64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_loongarch64_gpregs_entry__init(gpregs); + gpregs->n_regs = GPR_NUM; + gpregs->regs = xmalloc(GPR_NUM * sizeof(uint64_t)); + if (!gpregs->regs) + goto err; + ti_loongarch64->gpregs = gpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + goto err; + user_loongarch64_fpregs_entry__init(fpregs); + fpregs->n_regs = FPR_NUM; + fpregs->regs = xmalloc(FPR_NUM * sizeof(uint64_t)); + if (!fpregs->regs) + goto err; + ti_loongarch64->fpregs = fpregs; + + return 0; +err: + return -1; +} 
+ +void arch_free_thread_info(CoreEntry *core) +{ + if (CORE_THREAD_ARCH_INFO(core)) { + if (CORE_THREAD_ARCH_INFO(core)->fpregs) { + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs); + } + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); + xfree(CORE_THREAD_ARCH_INFO(core)); + CORE_THREAD_ARCH_INFO(core) = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + fpu_context_t *fpu = RT_SIGFRAME_FPU(sigframe); + UserLoongarch64FpregsEntry *fpregs = core->ti_loongarch64->fpregs; + + memcpy(fpu->regs, fpregs->regs, sizeof(fpu->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int restore_gpregs(struct rt_sigframe *sigframe, UserRegsEntry *r) +{ + sigcontext_t *sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, r->regs, sizeof(sc->regs)); + sc->pc = r->pc; + return 0; +} diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h new file mode 100644 index 0000000000..04347155c3 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/dump.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_loongarch64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/int.h b/criu/arch/loongarch64/include/asm/int.h new file mode 100644 index 0000000000..642804e9b4 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/kerndat.h 
b/criu/arch/loongarch64/include/asm/kerndat.h new file mode 100644 index 0000000000..bb70cf6cf5 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/parasite-syscall.h b/criu/arch/loongarch64/include/asm/parasite-syscall.h new file mode 100644 index 0000000000..6008c37923 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/loongarch64/include/asm/parasite.h b/criu/arch/loongarch64/include/asm/parasite.h new file mode 100644 index 0000000000..b64cb3185c --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite.h @@ -0,0 +1,11 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm volatile("or %0, $zero, $tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/loongarch64/include/asm/restore.h b/criu/arch/loongarch64/include/asm/restore.h new file mode 100644 index 0000000000..d956231c81 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restore.h @@ -0,0 +1,33 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args) \ +({ \ + uint64_t save_sp; \ + asm volatile("or %0, $zero, $sp" : "=r"(save_sp) : :"memory"); \ + asm volatile( \ + "or $a0, $zero, %2 \n" \ + "or $sp, $zero, %0 \n" \ + "jirl $ra, %1, 0 \n" \ + : \ + : "r"(new_sp & ~15), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "$a0", "memory"); \ + asm volatile("or $sp, $zero, %0" : : "r"(save_sp) : "memory"); \ +}) + +/* clang-format on */ + 
+static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_loongarch64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/loongarch64/include/asm/restorer.h b/criu/arch/loongarch64/include/asm/restorer.h new file mode 100644 index 0000000000..7a0d35c5b5 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restorer.h @@ -0,0 +1,97 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include "asm/types.h" +#include +#include "images/core.pb-c.h" +#include +#include + +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld.d $a1, %2 \n" \ + "addi.d $a1, $a1, -16 \n" \ + "st.d %5, $a1, 0 \n" \ + "st.d %6, $a1, 8 \n" \ + "or $a0, $zero, %1 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone_end \n" \ + \ + "thread_run: \n" \ + "ld.d $a1, $sp, 0 \n" \ + "ld.d $a0, $sp, 8 \n" \ + "jirl $ra, $a1, 0 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "ZB"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(&clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + "or $a0, $zero, %1 \n" \ + "or $a1, $zero, %2 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone3)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, clone3_thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + "or $a0, $zero, $a3 \n" \ + "jirl $ra, $a2, 0 \n" \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + 
"r"(clone_restore_fn), \ + "r"(args) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") +/* clang-format on */ + +static inline void restore_tls(tls_t *ptls) +{ + asm volatile("or $tp, $zero, %0" : : "r"(*ptls)); +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +int restore_gpregs(struct rt_sigframe *f, UserLoongarch64GpregsEntry *r); +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r); + +#define arch_map_vdso(map, compat) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/thread_pointer.h b/criu/arch/loongarch64/include/asm/thread_pointer.h new file mode 100644 index 0000000000..f7e07066a5 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . 
*/ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/loongarch64/include/asm/types.h b/criu/arch/loongarch64/include/asm/types.h new file mode 100644 index 0000000000..72bca2022b --- /dev/null +++ b/criu/arch/loongarch64/include/asm/types.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" +#include "images/core.pb-c.h" + +#include + +#define core_is_compat(core) false + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__LOONGARCH64 + +#define CORE_THREAD_ARCH_INFO(core) core->ti_loongarch64 + +#define TI_SP(core) ((core)->ti_loongarch64->gpregs->regs[4]) + +#define TI_IP(core) ((core)->ti_loongarch64->gpregs->pc) + +typedef UserLoongarch64GpregsEntry UserRegsEntry; + +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} + +#define AT_VECTOR_SIZE 44 +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/loongarch64/include/asm/vdso.h b/criu/arch/loongarch64/include/asm/vdso.h new file mode 100644 index 0000000000..64631dee09 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/vdso.h @@ -0,0 +1,27 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* This definition is used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + */ + +/* + * This is a minimal amount of symbols + * we should support at the moment. 
+ */ +#define VDSO_SYMBOL_MAX 5 +#define VDSO_SYMBOL_GTOD 3 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__vdso_getcpu"; \ + const char *aarch_vdso_symbol2 = "__vdso_clock_getres"; \ + const char *aarch_vdso_symbol3 = "__vdso_clock_gettime"; \ + const char *aarch_vdso_symbol4 = "__vdso_gettimeofday"; \ + const char *aarch_vdso_symbol5 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5 +#endif diff --git a/criu/arch/loongarch64/restorer.c b/criu/arch/loongarch64/restorer.c new file mode 100644 index 0000000000..730318ac14 --- /dev/null +++ b/criu/arch/loongarch64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" +#include + +#include +#include "log.h" +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r) +{ + return 0; +} diff --git a/criu/arch/loongarch64/sigframe.c b/criu/arch/loongarch64/sigframe.c new file mode 100644 index 0000000000..18983ff138 --- /dev/null +++ b/criu/arch/loongarch64/sigframe.c @@ -0,0 +1,12 @@ +#include +#include + +#include "asm/sigframe.h" +#include "asm/types.h" + +#include "log.h" +#include +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/loongarch64/vdso-pie.c b/criu/arch/loongarch64/vdso-pie.c new file mode 100644 index 0000000000..7a75d2741d --- /dev/null +++ b/criu/arch/loongarch64/vdso-pie.c @@ -0,0 +1,48 @@ +#include +#include "asm/types.h" + +#include +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " +static void insert_trampoline(uintptr_t from, uintptr_t to) +{ + struct { + uint32_t pcaddi; + uint32_t ldptr; + uint32_t jirl; + uint32_t guards; + uint64_t imm64; + } __packed jmp = { + .pcaddi = 0x18000095, /* pcaddi $x, 4 */ + .ldptr = 
0x260002b5, /* ldptr.d $x, $x, 0 */ + .jirl = 0x4c0002a0, /* jirl $zero, $x, 0 */ + .guards = 0x002a0000, /* break 0 */ + .imm64 = to, + }; + memcpy((void *)from, &jmp, sizeof(jmp)); +} + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, + struct vdso_symtable *sfrom, bool compat_vdso) +{ + unsigned int i; + unsigned long from, to; + for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { + if (vdso_symbol_empty(&sfrom->symbols[i])) + continue; + pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, + sto->symbols[i].offset, i); + + from = base_from + sfrom->symbols[i].offset; + to = base_to + sto->symbols[i].offset; + insert_trampoline(from, to); + } + return 0; +} diff --git a/criu/arch/x86/Makefile b/criu/arch/x86/Makefile index 618e85bb3e..46f00e9e93 100644 --- a/criu/arch/x86/Makefile +++ b/criu/arch/x86/Makefile @@ -9,6 +9,7 @@ obj-y += cpu.o obj-y += crtools.o obj-y += kerndat.o obj-y += sigframe.o +obj-y += shstk.o ifeq ($(CONFIG_COMPAT),y) obj-y += sigaction_compat.o endif diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index 912a4348b9..e068a9a020 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -133,6 +133,14 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpre #undef assign_array #undef assign_xsave + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + UserX86CetEntry *cet = core->thread_info->fpregs->xsave->cet; + struct cet_user_state *regs = &fpregs->cet; + + cet->cet = regs->cet; + cet->ssp = regs->ssp; + } + return 0; } @@ -199,6 +207,13 @@ static int alloc_xsave_extends(UserX86XsaveEntry *xsave) goto err; } + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + xsave->cet = xzalloc(sizeof(UserX86CetEntry)); + if (!xsave->cet) + goto err; + user_x86_cet_entry__init(xsave->cet); + } + return 0; err: return -1; @@ -220,6 +235,8 @@ int arch_alloc_thread_info(CoreEntry *core) with_xsave = 
compel_cpu_has_feature(X86_FEATURE_OSXSAVE); if (with_xsave) sz += sizeof(UserX86XsaveEntry); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) + sz += sizeof(UserX86CetEntry); } m = xmalloc(sz); diff --git a/criu/arch/x86/include/asm/kerndat.h b/criu/arch/x86/include/asm/kerndat.h index 903bc80f7c..5c37172302 100644 --- a/criu/arch/x86/include/asm/kerndat.h +++ b/criu/arch/x86/include/asm/kerndat.h @@ -4,5 +4,6 @@ extern int kdat_compatible_cr(void); extern int kdat_can_map_vdso(void); extern int kdat_x86_has_ptrace_fpu_xsave_bug(void); +extern int kdat_has_shstk(void); #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h index f7a6d50589..3a673958d1 100644 --- a/criu/arch/x86/include/asm/restorer.h +++ b/criu/arch/x86/include/asm/restorer.h @@ -8,6 +8,7 @@ #include #include #include "asm/compat.h" +#include "asm/shstk.h" #ifdef CONFIG_COMPAT extern void restore_tls(tls_t *ptls); diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h new file mode 100644 index 0000000000..7814c351d1 --- /dev/null +++ b/criu/arch/x86/include/asm/shstk.h @@ -0,0 +1,272 @@ +#ifndef __CR_ASM_SHSTK_H__ +#define __CR_ASM_SHSTK_H__ + +/* + * Shadow stack constants from Linux + */ +/* arch/x86/include/uapi/asm/mman.h */ +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +/* arch/x86/include/uapi/asm/prctl.h */ +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + +#define ARCH_HAS_SHSTK + +/* from arch/x86/kernel/shstk.c */ +#define SHSTK_DATA_BIT (1UL << 63) /* BIT(63) */ + +/* + * Shadow stack memory cannot be restored with memcpy/pread but only using + * a special instruction that can write to shadow stack. 
+ * That instruction is only available when shadow stack is enabled, + * otherwise it causes #UD. + * + * Also, shadow stack VMAs cannot be mmap()ed or mremap()ed, they must be + * created using map_shadow_stack() system call. This pushes creation of + * shadow stack VMAs to the restorer blob after CRIU mappings are freed. + * + * And there is additional juggling with shadow stacks to ensure that we + * don't unmap an active shadow stack + * + * The overall sequence of restoring shadow stack is + * - Enable shadow stack early after clone()ing the task + * - Unlock shadow stack features using ptrace + * - In the restorer blob: + * - switch to a temporary shadow stack to be able to unmap shadow stack + * with the CRIU mappings + * - after memory mappings are restored, recreate shadow stack VMAs, + * populate them using wrss instruction and switch to the task shadow + * stack + * - lock shadow stack features + */ +struct rst_shstk_info { + unsigned long vma_start; /* start of shadow stack VMA */ + unsigned long vma_size; /* size of shadow stack VMA */ + unsigned long premmaped_addr; /* address of shadow stack copy in + the premmaped area */ + unsigned long tmp_shstk; /* address of temporary shadow stack */ + u64 ssp; /* shadow stack pointer */ + u64 cet; /* CET control state */ +}; +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_shstk_prepare + +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid); +#define arch_shstk_unlock arch_shstk_unlock + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg); +#define arch_shstk_trampoline arch_shstk_trampoline + +#ifdef CR_NOGLIBC + +#include +#include +#include "vma.h" + +#define SHSTK_BUSY_BIT (1UL << 0) /* BIT(0) */ + +static inline int shstk_map(unsigned long addr, 
unsigned long size) +{ + long shstk = sys_map_shadow_stack(addr, size, SHADOW_STACK_SET_TOKEN); + + if (shstk < 0) { + pr_err("Failed to map shadow stack at %lx: %ld\n", addr, shstk); + return -1; + } + + if (shstk != addr) { + pr_err("Shadow stack address mismatch: need %lx, got %lx\n", addr, shstk); + return -1; + } + + pr_info("Created shadow stack at %lx\n", shstk); + + return 0; +} + +/* clang-format off */ +static inline unsigned long get_ssp(void) +{ + unsigned long ssp; + + asm volatile("rdsspq %0" : "=r"(ssp) :: ); + + return ssp; +} + +static inline void wrssq(unsigned long addr, unsigned long val) +{ + asm volatile("wrssq %1, (%0)" :: "r"(addr), "r"(val) : "memory"); +} +/* clang-format off */ + +static always_inline void shstk_switch_ssp(unsigned long new_ssp) +{ + unsigned long old_ssp = get_ssp(); + + asm volatile("rstorssp (%0)\n" :: "r"(new_ssp)); + asm volatile("saveprevssp"); + + pr_debug("changed ssp from %lx to %lx\n", old_ssp, new_ssp); +} + +/* + * Disable writes to the shadow stack and lock its disable/enable control + */ +static inline int shstk_finalize(void) +{ + int ret = 0; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return ret; +} + +/* + * Restore contents of the shadow stack and set shadow stack pointer + */ +static always_inline int shstk_restore(struct rst_shstk_info *cet) +{ + unsigned long *shstk_data = (unsigned long *)cet->premmaped_addr; + unsigned long ssp = cet->vma_start + cet->vma_size - 8; + unsigned long shstk_top = cet->vma_size / 8 - 1; + unsigned long val; + long ret; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + if (shstk_map(cet->vma_start, cet->vma_size)) + return -1; + + /* + * Switch shadow stack from temporary location to the actual task's + * shadow stack VMA + */ + 
shstk_switch_ssp(ssp); + + /* restore shadow stack contents */ + for (; ssp >= cet->ssp; ssp -= 8, shstk_top--) + wrssq(ssp, shstk_data[shstk_top]); + + /* + * Add tokens for sigreturn frame and for switch of the shadow stack. + * The sigreturn token will be checked by the kernel during + * processing of sigreturn + * The token for stack switch is required by rstorssp and + * saveprevssp semantics + */ + + /* token for sigreturn frame */ + val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; + wrssq(ssp, val); + + /* shadow stack switch token */ + val = ssp | SHSTK_BUSY_BIT; + ssp -= 8; + wrssq(ssp, val); + + /* reset shadow stack pointer to the proper location */ + shstk_switch_ssp(ssp); + + ret = sys_munmap(shstk_data, cet->vma_size + PAGE_SIZE); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + return shstk_finalize(); +} +#define arch_shstk_restore shstk_restore + +/* + * Disable shadow stack + */ +static inline int shstk_disable(void) +{ + int ret; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); + if (ret) { + pr_err("Failed to disable shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return 0; +} + +/* + * Switch to temporary shadow stack + */ +static always_inline int shstk_switch_to_restorer(struct rst_shstk_info *cet) +{ + unsigned long ssp; + long ret; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + ret = sys_munmap((void *)cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) { + pr_err("Failed to unmap area for temporary shadow stack\n"); + return -1; + } + + ret = shstk_map(cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) + return -1; + + /* + * Switch shadow stack from the default created by the kernel to a + * temporary shadow stack allocated in 
the premmaped area + */ + ssp = cet->tmp_shstk + PAGE_SIZE - 8; + shstk_switch_ssp(ssp); + + ret = sys_arch_prctl(ARCH_SHSTK_ENABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to enable writes to shadow stack\n"); + return ret; + } + + return 0; +} +#define arch_shstk_switch_to_restorer shstk_switch_to_restorer + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h index 3b3f292bde..ca46374a55 100644 --- a/criu/arch/x86/include/asm/vdso.h +++ b/criu/arch/x86/include/asm/vdso.h @@ -12,7 +12,7 @@ * This is a minimal amount of symbols * we should support at the moment. */ -#define VDSO_SYMBOL_MAX 6 +#define VDSO_SYMBOL_MAX 7 #define VDSO_SYMBOL_GTOD 2 /* @@ -42,11 +42,12 @@ const char *aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ const char *aarch_vdso_symbol4 = "__vdso_time"; \ const char *aarch_vdso_symbol5 = "__kernel_sigreturn"; \ - const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; + const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; \ + const char *aarch_vdso_symbol7 = "__vdso_clock_gettime64"; \ #define ARCH_VDSO_SYMBOLS \ aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5, \ - aarch_vdso_symbol6 + aarch_vdso_symbol6, aarch_vdso_symbol7 /* "__kernel_vsyscall", */ diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c index a98797d39f..3a58bbea7a 100644 --- a/criu/arch/x86/kerndat.c +++ b/criu/arch/x86/kerndat.c @@ -17,6 +17,7 @@ #include "asm/compat.h" #include "asm/dump.h" +#include "asm/shstk.h" int kdat_can_map_vdso(void) { @@ -251,3 +252,29 @@ int kdat_x86_has_ptrace_fpu_xsave_bug(void) return ret; } + +/* + * Unlike most kerndat knobs, this does not check for availability of the + * shadow stack in the kernel, but rather checks if criu runs with shadow + * stack enabled. + * + * This depends on hardware availability, kernel and glibc support, compiler + * options and glibc tunables. 
+ */ +int kdat_has_shstk(void) +{ + unsigned long features; + + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return 0; + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_STATUS, &features)) { + /* kernels that don't support shadow stack return -EINVAL */ + if (errno == EINVAL) + return 0; + pr_perror("Cannot get shadow stack status"); + return 1; + } + + return !!(features & ARCH_SHSTK_SHSTK); +} diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c new file mode 100644 index 0000000000..b752f114a8 --- /dev/null +++ b/criu/arch/x86/shstk.c @@ -0,0 +1,223 @@ +#include +#include + +#include + +#include + +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +static bool task_needs_shstk(struct pstree_item *item, CoreEntry *core) +{ + UserX86FpregsEntry *fpregs; + + if (!task_alive(item)) + return false; + + fpregs = core->thread_info->fpregs; + if (fpregs->xsave && fpregs->xsave->cet) { + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + pr_warn_once("Restoring task with shadow stack on non-CET machine\n"); + return false; + } + + if (fpregs->xsave->cet->cet & ARCH_SHSTK_SHSTK) + return true; + } + + return false; +} + +static int shstk_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *shstk) +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, shstk->ssp)) { + unsigned long premmaped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + shstk->vma_start = vma->e->start; + shstk->vma_size = size; + shstk->premmaped_addr = premmaped_addr; + shstk->tmp_shstk = premmaped_addr + size; + + break; + } + } + + return 0; +} + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) +{ + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + UserX86FpregsEntry *fpregs = core->thread_info->fpregs; + struct vm_area_list *vmas = &rsti(item)->vmas; + struct 
rst_shstk_info *shstk = &ta->shstk; + int i; + + if (!task_needs_shstk(item, core)) + return 0; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + struct thread_restore_args *thread_args = &args_array[i]; + + core = item->core[i]; + fpregs = core->thread_info->fpregs; + shstk = &thread_args->shstk; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + } + + return 0; +} + +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid) +{ + unsigned long features; + int status; + int ret = -1; + + /* + * CRIU runs with no shadow stack and the task does not need one, + * nothing to do. + */ + if (!kdat.has_shstk && !task_needs_shstk(item, core)) + return 0; + + futex_wait_until(&rsti(item)->shstk_enable, 1); + + if (ptrace(PTRACE_SEIZE, pid, 0, 0)) { + pr_perror("Cannot attach to %d", pid); + goto futex_wake; + } + + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Cannot interrupt the %d task", pid); + goto detach; + } + + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("waitpid(%d) failed", pid); + goto detach; + } + + features = ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS; + if (ptrace(PTRACE_ARCH_PRCTL, pid, features, ARCH_SHSTK_UNLOCK)) { + pr_perror("Cannot unlock CET for %d task", pid); + goto detach; + } + +detach: + if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { + pr_perror("Unable to detach %d", pid); + goto futex_wake; + } + + ret = 0; + +futex_wake: + futex_set_and_wake(&rsti(item)->shstk_unlock, 1); + + return ret; +} + +static void shstk_sync_unlock(struct pstree_item *item) +{ + /* notify parent that shadow stack is enabled ... */ + futex_set_and_wake(&rsti(item)->shstk_enable, 1); + + /* ... 
and wait until it unlocks its features with ptrace */ + futex_wait_until(&rsti(item)->shstk_unlock, 1); +} + +static void __arch_shstk_enable(struct pstree_item *item, + int (*func)(void *arg), void *arg) +{ + int ret; + + shstk_sync_unlock(item); + + /* return here would cause #CP, use exit() instead */ + ret = func(arg); + exit(ret); +} + +static int shstk_disable(struct pstree_item *item) +{ + shstk_sync_unlock(item); + + /* disable shadow stack, implicitly clears ARCH_SHSTK_WRSS */ + if (syscall(__NR_arch_prctl, ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK)) { + pr_perror("Failed to disable shadow stack"); + return -1; + } + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_LOCK, + ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS)) { + pr_perror("Failed to lock shadow stack controls"); + return -1; + } + + return 0; +} + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + unsigned long features = ARCH_SHSTK_SHSTK; + int code = ARCH_SHSTK_ENABLE; + + /* + * If task does not need shadow stack but CRIU runs with shadow + * stack enabled, we should disable it before continuing with + * restore + */ + if (!task_needs_shstk(item, core)) { + if (kdat.has_shstk && shstk_disable(item)) + return -1; + return func(arg); + } + + /* + * Calling sys_arch_prctl() means there will be use of retq + * instruction after shadow stack is enabled and this will cause + * Control Protection fault. Open code sys_arch_prctl() in + * assembly. + * + * code and features should be in %rdi and %rsi and will be passed to + * the system call as is. 
+ */ + asm volatile("movq $"__stringify(__NR_arch_prctl)", %%rax \n" + "syscall \n" + "cmpq $0, %%rax \n" + "je 1f \n" + "retq \n" + "1: \n" + :: "D"(code), "S"(features)); + + __arch_shstk_enable(item, func, arg); + + /* never reached */ + return -1; +} diff --git a/criu/arch/x86/sigframe.c b/criu/arch/x86/sigframe.c index 4fa7eb3dc9..46612e70d3 100644 --- a/criu/arch/x86/sigframe.c +++ b/criu/arch/x86/sigframe.c @@ -23,7 +23,7 @@ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *r } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; - } else if (!sigframe->is_native) { + } else { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_ia32.xsave; sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; if ((addr % 64ul)) { diff --git a/criu/autofs.c b/criu/autofs.c index 6a7d8db0df..a1775cbc96 100644 --- a/criu/autofs.c +++ b/criu/autofs.c @@ -658,7 +658,7 @@ static int autofs_mnt_make_catatonic(const char *mnt_path, int mnt_fd) static int autofs_mnt_set_timeout(time_t timeout, const char *mnt_path, int mnt_fd) { - pr_info("%s: set timeout %ld for %s\n", __func__, timeout, mnt_path); + pr_info("%s: set timeout %" PRId64 " for %s\n", __func__, (int64_t)timeout, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_SETTIMEOUT, &timeout); } @@ -770,7 +770,7 @@ static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, time_t timeout } if (autofs_mnt_set_timeout(timeout, mnt_path, mnt_fd)) { - pr_err("Failed to set timeout %ld for %s\n", timeout, mnt_path); + pr_err("Failed to set timeout %" PRId64 " for %s\n", (int64_t)timeout, mnt_path); return -1; } diff --git a/criu/cgroup.c b/criu/cgroup.c index 8243ac6d3c..fcaed07080 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -28,7 +28,6 @@ #include "images/cgroup.pb-c.h" #include "kerndat.h" #include "linux/mount.h" -#include "syscall.h" /* * This structure describes set of controller groups @@ -427,10 +426,11 @@ 
static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const } /* - * Set the is_threaded flag if cgroup.type's value is threaded, - * ignore all other values. + * Set the is_threaded flag if cgroup.type's value is threaded + * or it is a cgroup v1 (it has a 'tasks' property). + * Ignore all other values. */ - if (!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) + if ((!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) || !strcmp("tasks", prop->name)) controller->is_threaded = true; pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); @@ -580,14 +580,15 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) int fsfd, fd; char *name; - fsfd = sys_fsopen(fstype, 0); + fsfd = cr_fsopen(fstype, 0); if (fsfd < 0) { pr_perror("Unable to open the cgroup file system"); return -1; } if (strstartswith(cc->name, namestr)) { - if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to configure the cgroup (%s) file system", cc->name); goto err; } @@ -595,7 +596,8 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) char *saveptr = NULL, *buf = strdupa(cc->name); name = strtok_r(buf, ",", &saveptr); while (name) { - if (sys_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to configure the cgroup (%s) file system", name); goto err; } @@ -603,14 +605,17 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) } } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to create the cgroup (%s) file system", cc->name); goto err; } - fd = sys_fsmount(fsfd, 0, 0); - if (fd < 0) + fd = cr_fsmount(fsfd, 0, 0); + if (fd < 0) { 
+ fsfd_dump_messages(fsfd); pr_perror("Unable to mount the cgroup (%s) file system", cc->name); + } close(fsfd); return fd; @@ -639,8 +644,8 @@ static int open_cgroupfs(struct cg_ctl *cc) return -1; } - if (mount("none", prefix, fstype, 0, mopts) < 0) { - pr_perror("Unable to mount %s", mopts); + if (mount("none", prefix, fstype, 0, mopts[0] ? mopts : NULL) < 0) { + pr_perror("Unable to mount %s %s", fstype, mopts); rmdir(prefix); return -1; } @@ -714,6 +719,8 @@ static int collect_cgroups(struct list_head *ctls) } } else { fd = open_cgroupfs(cc); + if (fd < 0) + return -1; } path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); @@ -1202,17 +1209,12 @@ static int prepare_cgns(CgSetEntry *se) return 0; } -static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) +static int move_in_cgroup(CgSetEntry *se) { int i; pr_info("Move into %d\n", se->id); - if (setup_cgns && prepare_cgns(se) < 0) { - pr_err("failed preparing cgns\n"); - return -1; - } - for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int fd = -1, err, j, aux_off; @@ -1252,7 +1254,44 @@ static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) return 0; } -int prepare_task_cgroup(struct pstree_item *me) +int prepare_cgroup_namespace(struct pstree_item *root_task) +{ + CgSetEntry *se; + + if (opts.manage_cgroups == CG_MODE_IGNORE) + return 0; + + if (root_task->parent) { + pr_err("Expecting root_task to restore cgroup namespace\n"); + return -1; + } + + /* + * If on dump all dumped tasks are in same cgset with criu we don't + * dump cgsets and thus cgroup namespaces and rely that on restore + * criu caller would prepare proper cgset/cgns for us. Also in case + * of --unprivileged we don't even have the root cgset here. 
+ */ + if (!rsti(root_task)->cg_set || rsti(root_task)->cg_set == root_cg_set) { + pr_info("Cgroup namespace inherited from parent\n"); + return 0; + } + + se = find_rst_set_by_id(rsti(root_task)->cg_set); + if (!se) { + pr_err("No set %d found\n", rsti(root_task)->cg_set); + return -1; + } + + if (prepare_cgns(se) < 0) { + pr_err("failed preparing cgns\n"); + return -1; + } + + return 0; +} + +int restore_task_cgroup(struct pstree_item *me) { struct pstree_item *parent = me->parent; CgSetEntry *se; @@ -1284,13 +1323,7 @@ int prepare_task_cgroup(struct pstree_item *me) return -1; } - /* Since don't support nesting of cgroup namespaces, let's only set up - * the cgns (if it exists) in the init task. In the future, we should - * just check that the cgns prefix string matches for all the entries - * in the cgset, and only unshare if that's true. - */ - - return move_in_cgroup(se, !me->parent); + return move_in_cgroup(se); } void fini_cgroup(void) @@ -1309,34 +1342,6 @@ void fini_cgroup(void) cg_yard = NULL; } -static int restore_perms(int fd, const char *path, CgroupPerms *perms) -{ - struct stat sb; - - if (perms) { - if (fstat(fd, &sb) < 0) { - pr_perror("stat of property %s failed", path); - return -1; - } - - /* only chmod/chown if the perms are actually different: we aren't - * allowed to chmod some cgroup props (e.g. the read only ones), so we - * don't want to try if the perms already match. 
- */ - if (sb.st_mode != (mode_t)perms->mode && fchmod(fd, perms->mode) < 0) { - pr_perror("chmod of %s failed", path); - return -1; - } - - if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) && fchown(fd, perms->uid, perms->gid)) { - pr_perror("chown of %s failed", path); - return -1; - } - } - - return 0; -} - static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) { char *current, *next; @@ -1434,7 +1439,7 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat return -1; } - if (restore_perms(fd, path, perms) < 0) + if (perms && cr_fchperm(fd, perms->uid, perms->gid, perms->mode) < 0) goto out; /* skip these two since restoring their values doesn't make sense */ @@ -1758,7 +1763,7 @@ static int restore_special_props(char *paux, size_t off, CgroupDirEntry *e) static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) { - int fd, ret; + int fd, ret = 0; fd = openat(cg, path, O_DIRECTORY); if (fd < 0) { @@ -1766,7 +1771,8 @@ static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) return -1; } - ret = restore_perms(fd, path, perms); + if (perms) + ret = cr_fchperm(fd, perms->uid, perms->gid, perms->mode); close(fd); return ret; } @@ -1921,7 +1927,7 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) if (ctrl->cnames[0][0] == 0) fstype = "cgroup2"; - pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); + pr_debug("\tMaking controller dir %s (%s), type %s\n", paux, opt, fstype); if (mkdir(paux, 0700)) { pr_perror("\tCan't make controller dir %s", paux); return -1; @@ -1945,6 +1951,21 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) return 0; } +static int cgroupd_unblock_sigterm(void) +{ + sigset_t unblockmask; + + sigemptyset(&unblockmask); + sigaddset(&unblockmask, SIGTERM); + + if (sigprocmask(SIG_UNBLOCK, &unblockmask, NULL)) { + pr_perror("cgroupd: can't unblock SIGTERM"); + return -1; + } + + return 0; +} + /* * If a thread is a different cgroup set than the main thread in 
process, * it means it is in a threaded controller. This daemon receives the cg_set @@ -1953,6 +1974,14 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) */ static int cgroupd(int sk) { + /* + * This pairs with SIGTERM in stop_cgroupd(), and ensures that cgroupd + * will receive termination signal, regardless of which signal block + * mask was inherited. + */ + if (cgroupd_unblock_sigterm()) + return -1; + pr_info("cgroud: Daemon started\n"); while (1) { @@ -1984,6 +2013,7 @@ static int cgroupd(int sk) CgMemberEntry *ce = cg_set_entry->ctls[i]; char aux[PATH_MAX]; CgControllerEntry *ctrl = NULL; + const char *format; for (j = 0; j < n_controllers; j++) { CgControllerEntry *cur = controllers[j]; @@ -2007,7 +2037,8 @@ static int cgroupd(int sk) continue; aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); - snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path); + format = ctrl->cnames[0][0] ? "/%s/tasks" : "/%s/cgroup.threads"; + snprintf(aux + aux_off, sizeof(aux) - aux_off, format, ce->path); /* * Cgroupd runs outside of the namespaces so we don't diff --git a/criu/config.c b/criu/config.c index 9f02ae9928..1322a490ab 100644 --- a/criu/config.c +++ b/criu/config.c @@ -1036,6 +1036,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, opts.network_lock_method = NETWORK_LOCK_IPTABLES; } else if (!strcmp("nftables", optarg)) { opts.network_lock_method = NETWORK_LOCK_NFTABLES; + } else if (!strcmp("skip", optarg) || !strcmp("none", optarg)) { + opts.network_lock_method = NETWORK_LOCK_SKIP; } else { pr_err("Invalid value for --network-lock: %s\n", optarg); return 1; diff --git a/criu/cr-check.c b/criu/cr-check.c index a4166f76ba..0388cbe7fe 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "../soccr/soccr.h" @@ -53,6 +54,7 @@ #include "restorer.h" #include "uffd.h" #include "linux/aio_abi.h" +#include "mount-v2.h" #include 
"images/inventory.pb-c.h" @@ -1086,6 +1088,8 @@ static int kerndat_tcp_repair_window(void) int sk, val = 1; sk = socket(AF_INET, SOCK_STREAM, 0); + if (sk < 0 && errno == EAFNOSUPPORT) + sk = socket(AF_INET6, SOCK_STREAM, 0); if (sk < 0) { pr_perror("Unable to create inet socket"); goto errn; @@ -1380,6 +1384,203 @@ static int check_ipv6_freebind(void) return 0; } +static int check_pagemap_scan(void) +{ + if (!kdat.has_pagemap_scan) + return -1; + + return 0; +} + +/* musl doesn't have a statx wrapper... */ +struct staty { + __u32 stx_dev_major; + __u32 stx_dev_minor; + __u64 stx_ino; +}; + +static long get_file_dev_and_inode(void *addr, struct staty *stx) +{ + char buf[4096]; + FILE *mapf; + + mapf = fopen("/proc/self/maps", "r"); + if (mapf == NULL) { + pr_perror("fopen(/proc/self/maps)"); + return -1; + } + + while (fgets(buf, sizeof(buf), mapf)) { + unsigned long start, end; + uint32_t maj, min; + __u64 ino; + + if (sscanf(buf, "%lx-%lx %*s %*s %x:%x %llu", + &start, &end, &maj, &min, &ino) != 5) { + pr_perror("Unable to parse: %s", buf); + return -1; + } + if (start == (unsigned long)addr) { + stx->stx_dev_major = maj; + stx->stx_dev_minor = min; + stx->stx_ino = ino; + return 0; + } + } + + pr_err("Unable to find the mapping\n"); + return -1; +} + +static int ovl_mount(void) +{ + int tmpfs, fsfd, ovl; + + fsfd = cr_fsopen("tmpfs", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen tmpfs"); + return -1; + } + + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create tmpfs mount"); + return -1; + } + + tmpfs = cr_fsmount(fsfd, 0, 0); + if (tmpfs == -1) { + pr_perror("Unable to mount tmpfs"); + return -1; + } + + close(fsfd); + + /* overlayfs can't be constructed on top of a detached mount. 
*/ + if (sys_move_mount(tmpfs, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)) { + pr_perror("Unable to attach tmpfs mount"); + return -1; + } + close(tmpfs); + + if (chdir("/tmp")) { + pr_perror("Unable to change working directory"); + return -1; + } + + if (mkdir("/tmp/w", 0755) == -1 || + mkdir("/tmp/u", 0755) == -1 || + mkdir("/tmp/l", 0755) == -1) { + pr_perror("mkdir"); + return -1; + } + + fsfd = cr_fsopen("overlay", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen overlayfs"); + return -1; + } + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { + pr_perror("Unable to configure overlayfs"); + return -1; + } + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create overlayfs"); + return -1; + } + ovl = cr_fsmount(fsfd, 0, 0); + if (ovl == -1) { + pr_perror("Unable to mount overlayfs"); + return -1; + } + + return ovl; +} + +/* + * Check that the file device and inode shown in /proc/pid/maps match values + * returned by stat(2). + */ +static int do_check_overlayfs_maps(void) +{ + struct staty stx, mstx; + struct stat st; + int ovl, fd; + void *addr; + + /* Create a new mount namespace to not care about cleaning test mounts. 
*/ + if (unshare(CLONE_NEWNS) == -1) { + pr_warn("Unable to create a new mount namespace\n"); + return 0; + } + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + pr_perror("Unable to remount / with MS_SLAVE"); + return -1; + } + + ovl = ovl_mount(); + if (ovl == -1) + return -1; + + fd = openat(ovl, "test", O_RDWR | O_CREAT, 0644); + if (fd == -1) { + pr_perror("Unable to open a test file"); + return -1; + } + + addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_perror("Unable to map the test file"); + return -1; + } + + if (get_file_dev_and_inode(addr, &mstx)) + return -1; + if (fstat(fd, &st)) { + pr_perror("stat"); + return -1; + } + stx.stx_dev_major = major(st.st_dev); + stx.stx_dev_minor = minor(st.st_dev); + stx.stx_ino = st.st_ino; + + if (stx.stx_dev_major != mstx.stx_dev_major || + stx.stx_dev_minor != mstx.stx_dev_minor || + stx.stx_ino != mstx.stx_ino) { + pr_err("unmatched dev:ino %x:%x:%llx (expected %x:%x:%llx)\n", + mstx.stx_dev_major, mstx.stx_dev_minor, mstx.stx_ino, + stx.stx_dev_major, stx.stx_dev_minor, stx.stx_ino); + return -1; + } + + return 0; +} + +static int check_overlayfs_maps(void) +{ + pid_t pid; + int status; + + pid = fork(); + if (pid == -1) { + pr_perror("Unable to fork a child"); + return -1; + } + if (pid == 0) { + if (do_check_overlayfs_maps()) + exit(1); + exit(0); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid"); + return -1; + } + return status == 0 ? 
0 : -1; +} + static int (*chk_feature)(void); /* @@ -1500,6 +1701,8 @@ int cr_check(void) ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); ret |= check_ipv6_freebind(); + ret |= check_pagemap_scan(); + ret |= check_overlayfs_maps(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1621,6 +1824,8 @@ static struct feature_list feature_list[] = { { "openat2", check_openat2 }, { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, + { "pagemap_scan", check_pagemap_scan }, + { "overlayfs_maps", check_overlayfs_maps }, { NULL, NULL }, }; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 90d763f497..1bc5d934f5 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -86,6 +86,8 @@ #include "pidfd-store.h" #include "apparmor.h" #include "asm/dump.h" +#include "timer.h" +#include "sigact.h" /* * Architectures can overwrite this function to restore register sets that @@ -157,6 +159,11 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) tc->has_sched_policy = true; tc->sched_policy = ret; + /* The reset-on-fork flag might be used in combination + * with SCHED_FIFO or SCHED_RR to reset the scheduling + * policy/priority in child processes. 
+ */ + ret &= ~SCHED_RESET_ON_FORK; if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) { ret = syscall(__NR_sched_getparam, pid, &sp); if (ret < 0) { @@ -770,6 +777,11 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->tc->child_subreaper = misc->child_subreaper; core->tc->has_child_subreaper = true; + if (misc->membarrier_registration_mask) { + core->tc->membarrier_registration_mask = misc->membarrier_registration_mask; + core->tc->has_membarrier_registration_mask = true; + } + ret = get_task_personality(pid, &core->tc->personality); if (ret < 0) goto err; @@ -2023,7 +2035,6 @@ static int cr_dump_finish(int ret) if (bfd_flush_images()) ret = -1; - cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); if (!ret) { @@ -2077,6 +2088,9 @@ static int cr_dump_finish(int ret) if (arch_set_thread_regs(root_item, true) < 0) return -1; + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state); timing_stop(TIME_FROZEN); free_pstree(root_item); @@ -2090,6 +2104,10 @@ static int cr_dump_finish(int ret) close_image_dir(); if (ret || post_dump_ret) { + if (fault_injected(FI_DUMP_CRASH)) { + pr_info("fault: CRIU dump crashed!\n"); + abort(); + } pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); @@ -2180,6 +2198,9 @@ int cr_dump_tasks(pid_t pid) if (network_lock()) goto err; + if (rpc_query_external_files()) + goto err; + if (collect_file_locks()) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index f02e95f6d2..646300bdb8 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -79,6 +79,7 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" +#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" @@ -98,6 +99,8 @@ #include "restore.h" #include "cr-errno.h" +#include "timer.h" +#include "sigact.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread @@ -118,7 +121,6 @@ 
static int restore_task_with_children(void *); static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); static int prepare_restorer_blob(void); static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); -static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); /* @@ -279,7 +281,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo }; /* These images are required to restore namespaces */ @@ -407,268 +409,6 @@ static int populate_pid_proc(void) return 0; } -static rt_sigaction_t sigchld_act; -/* - * If parent's sigaction has blocked SIGKILL (which is non-sense), - * this parent action is non-valid and shouldn't be inherited. - * Used to mark parent_act* no more valid. - */ -static rt_sigaction_t parent_act[SIGMAX]; -#ifdef CONFIG_COMPAT -static rt_sigaction_t_compat parent_act_compat[SIGMAX]; -#endif - -static bool sa_inherited(int sig, rt_sigaction_t *sa) -{ - rt_sigaction_t *pa; - int i; - - if (current == root_item) - return false; /* XXX -- inherit from CRIU? 
*/ - - pa = &parent_act[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_native_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); -#ifdef CONFIG_MIPS - e->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); - - memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); - memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); -#else - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); -#endif - if (sig == SIGCHLD) { - sigchld_act = act; - return 0; - } - - if (sa_inherited(sig - 1, &act)) - return 1; - - /* - * A pure syscall is used, because glibc - * sigaction overwrites se_restorer. 
- */ - ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); - if (ret < 0) { - pr_perror("Can't restore sigaction"); - return ret; - } - - parent_act[sig - 1] = act; - /* Mark SIGKILL blocked which makes compat sigaction non-valid */ -#ifdef CONFIG_COMPAT - parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; -#endif - - return 1; -} - -static void *stack32; - -#ifdef CONFIG_COMPAT -static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) -{ - rt_sigaction_t_compat *pa; - int i; - - if (current == root_item) - return false; - - pa = &parent_act_compat[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t_compat act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); - - if (sig == SIGCHLD) { - memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); - return 0; - } - - if (sa_compat_inherited(sig - 1, &act)) - return 1; - - if (!stack32) { - stack32 = alloc_compat_syscall_stack(); - if (!stack32) - return -1; - } - - ret = arch_compat_rt_sigaction(stack32, sig, &act); - if (ret < 0) { - pr_err("Can't restore compat sigaction: %d\n", ret); - return ret; - } - - parent_act_compat[sig - 1] = act; - /* Mark SIGKILL blocked which makes native sigaction non-valid */ - parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; - - return 1; -} -#else -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - return -1; 
-} -#endif - -static int prepare_sigactions_from_core(TaskCoreEntry *tc) -{ - int sig, i; - - if (tc->n_sigactions != SIGMAX - 2) { - pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); - return -1; - } - - pr_info("Restore on-core sigactions for %d\n", vpid(current)); - - for (sig = 1, i = 0; sig <= SIGMAX; sig++) { - int ret; - SaEntry *e; - bool sigaction_is_compat; - - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - e = tc->sigactions[i++]; - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - if (ret < 0) - return ret; - } - - return 0; -} - -/* Returns number of restored signals, -1 or negative errno on fail */ -static int restore_one_sigaction(int sig, struct cr_img *img, int pid) -{ - bool sigaction_is_compat; - SaEntry *e; - int ret = 0; - - BUG_ON(sig == SIGKILL || sig == SIGSTOP); - - ret = pb_read_one_eof(img, &e, PB_SIGACT); - if (ret == 0) { - if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ - pr_err("Unexpected EOF %d\n", sig); - return -1; - } - pr_warn("This format of sigacts-%d.img is deprecated\n", pid); - return -1; - } - if (ret < 0) - return ret; - - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - sa_entry__free_unpacked(e, NULL); - - return ret; -} - -static int prepare_sigactions_from_image(void) -{ - int pid = vpid(current); - struct cr_img *img; - int sig, rst = 0; - int ret = 0; - - pr_info("Restore sigacts for %d\n", pid); - - img = open_image(CR_FD_SIGACT, O_RSTR, pid); - if (!img) - return -1; - - for (sig = 1; sig <= SIGMAX; sig++) { - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - ret = restore_one_sigaction(sig, img, pid); - if (ret < 0) - break; - if (ret) - rst++; - } - - 
pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); - - close_image(img); - return ret; -} - -static int prepare_sigactions(CoreEntry *core) -{ - int ret; - - if (!task_alive(current)) - return 0; - - if (core->tc->n_sigactions != 0) - ret = prepare_sigactions_from_core(core->tc); - else - ret = prepare_sigactions_from_image(); - - if (stack32) { - free_compat_syscall_stack(stack32); - stack32 = NULL; - } - - return ret; -} - static int __collect_child_pids(struct pstree_item *p, int state, unsigned int *n) { struct pstree_item *pi; @@ -863,6 +603,9 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a if (tc->has_child_subreaper) args->child_subreaper = tc->child_subreaper; + if (tc->has_membarrier_registration_mask) + args->membarrier_registration_mask = tc->membarrier_registration_mask; + /* loginuid value is critical to restore */ if (kdat.luid == LUID_FULL && tc->has_loginuid && tc->loginuid != INVALID_UID) { ret = prepare_loginuid(tc->loginuid); @@ -879,7 +622,6 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a return 0; } -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); static int prepare_mm(pid_t pid, struct task_restore_args *args); static int restore_one_alive_task(int pid, CoreEntry *core) @@ -972,6 +714,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (setup_uffd(pid, ta)) return -1; + if (arch_shstk_prepare(current, core, ta)) + return -1; + return sigreturn_restore(pid, ta, args_len, core); } @@ -1492,6 +1237,8 @@ static inline int fork_with_pid(struct pstree_item *item) pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item)); } + arch_shstk_unlock(item, ca.core, pid); + err_unlock: if (!(ca.clone_flags & CLONE_NEWPID)) unlock_last_pid(); @@ -1758,7 +1505,7 @@ static int create_children_and_session(void) return 0; } -static int restore_task_with_children(void *_arg) +static int 
__restore_task_with_children(void *_arg) { struct cr_clone_arg *ca = _arg; pid_t pid; @@ -1827,6 +1574,13 @@ static int restore_task_with_children(void *_arg) /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; + + /* + * Since we don't support nesting of cgroup namespaces, let's + * only set up the cgns (if it exists) in the init task. + */ + if (prepare_cgroup_namespace(current) < 0) + goto err; } if (needs_prep_creds(current) && (prepare_userns_creds())) @@ -1838,7 +1592,7 @@ static int restore_task_with_children(void *_arg) * we will only move the root one there, others will * just have it inherited. */ - if (prepare_task_cgroup(current) < 0) + if (restore_task_cgroup(current) < 0) goto err; /* Restore root task */ @@ -1943,6 +1697,16 @@ static int restore_task_with_children(void *_arg) exit(1); } +static int restore_task_with_children(void *_arg) +{ + struct cr_clone_arg *arg = _arg; + struct pstree_item *item = arg->item; + CoreEntry *core = arg->core; + + return arch_shstk_trampoline(item, core, __restore_task_with_children, + arg); +} + static int attach_to_tasks(bool root_seized) { struct pstree_item *item; @@ -2461,6 +2225,11 @@ static int restore_root_task(struct pstree_item *init) } finalize_restore(); + + /* just before releasing threads we have to restore rseq_cs */ + if (restore_rseq_cs()) + pr_err("Unable to restore rseq_cs state\n"); + /* * Some external devices such as GPUs might need a very late * trigger to kick-off some events, memory notifiers and for @@ -2470,8 +2239,10 @@ static int restore_root_task(struct pstree_item *init) * mapped memory) could be done sanely once the pie code hands * over the control to master process. 
*/ + pr_info("Run late stage hook from criu master for external devices\n"); for_each_pstree_item(item) { - pr_info("Run late stage hook from criu master for external devices\n"); + if (!task_alive(item)) + continue; ret = run_plugins(RESUME_DEVICES_LATE, item->pid->real); /* * This may not really be an error. Only certain plugin hooks @@ -2492,10 +2263,6 @@ static int restore_root_task(struct pstree_item *init) if (restore_freezer_state()) pr_err("Unable to restore freezer state\n"); - /* just before releasing threads we have to restore rseq_cs */ - if (restore_rseq_cs()) - pr_err("Unable to restore rseq_cs state\n"); - /* Detaches from processes and they continue run through sigreturn. */ if (finalize_restore_detach()) goto out_kill_network_unlocked; @@ -2587,12 +2354,12 @@ int cr_restore_tasks(void) if (init_service_fd()) return 1; - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) - return -1; - if (check_img_inventory(/* restore = */ true) < 0) goto err; + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; + if (init_stats(RESTORE_STATS)) goto err; @@ -2694,251 +2461,6 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he return -1; } -static inline int timeval_valid(struct timeval *tv) -{ - return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); -} - -static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) -{ - if (ie->isec == 0 && ie->iusec == 0) { - memzero_p(val); - return 0; - } - - val->it_interval.tv_sec = ie->isec; - val->it_interval.tv_usec = ie->iusec; - - if (!timeval_valid(&val->it_interval)) { - pr_err("Invalid timer interval\n"); - return -1; - } - - if (ie->vsec == 0 && ie->vusec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. 
- */ - val->it_value.tv_sec = ie->isec; - val->it_value.tv_usec = ie->iusec; - } else { - val->it_value.tv_sec = ie->vsec; - val->it_value.tv_usec = ie->vusec; - } - - if (!timeval_valid(&val->it_value)) { - pr_err("Invalid timer value\n"); - return -1; - } - - pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, - val->it_interval.tv_sec, val->it_interval.tv_usec); - - return 0; -} - -/* - * Legacy itimers restore from CR_FD_ITIMERS - */ - -static int prepare_itimers_from_fd(int pid, struct task_restore_args *args) -{ - int ret = -1; - struct cr_img *img; - ItimerEntry *ie; - - if (!deprecated_ok("Itimers")) - return -1; - - img = open_image(CR_FD_ITIMERS, O_RSTR, pid); - if (!img) - return -1; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("real", ie, &args->itimers[0]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("virt", ie, &args->itimers[1]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("prof", ie, &args->itimers[2]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; -out: - close_image(img); - return ret; -} - -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) -{ - int ret = 0; - TaskTimersEntry *tte = core->tc->timers; - - if (!tte) - return prepare_itimers_from_fd(pid, args); - - ret |= decode_itimer("real", tte->real, &args->itimers[0]); - ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); - ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); - - return ret; -} - -static inline int timespec_valid(struct timespec *ts) -{ - return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); -} - -static inline int decode_posix_timer(PosixTimerEntry *pte, struct 
restore_posix_timer *pt) -{ - pt->val.it_interval.tv_sec = pte->isec; - pt->val.it_interval.tv_nsec = pte->insec; - - if (!timespec_valid(&pt->val.it_interval)) { - pr_err("Invalid timer interval(posix)\n"); - return -1; - } - - if (pte->vsec == 0 && pte->vnsec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. - */ - pt->val.it_value.tv_sec = pte->isec; - pt->val.it_value.tv_nsec = pte->insec; - } else { - pt->val.it_value.tv_sec = pte->vsec; - pt->val.it_value.tv_nsec = pte->vnsec; - } - - if (!timespec_valid(&pt->val.it_value)) { - pr_err("Invalid timer value(posix)\n"); - return -1; - } - - pt->spt.it_id = pte->it_id; - pt->spt.clock_id = pte->clock_id; - pt->spt.si_signo = pte->si_signo; - pt->spt.it_sigev_notify = pte->it_sigev_notify; - pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); - pt->spt.notify_thread_id = pte->notify_thread_id; - pt->overrun = pte->overrun; - - return 0; -} - -static int cmp_posix_timer_proc_id(const void *p1, const void *p2) -{ - return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; -} - -static void sort_posix_timers(struct task_restore_args *ta) -{ - void *tmem; - - /* - * This is required for restorer's create_posix_timers(), - * it will probe them one-by-one for the desired ID, since - * kernel doesn't provide another API for timer creation - * with given ID. 
- */ - - if (ta->posix_timers_n > 0) { - tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); - qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); - } -} - -/* - * Legacy posix timers restoration from CR_FD_POSIX_TIMERS - */ - -static int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) -{ - struct cr_img *img; - int ret = -1; - struct restore_posix_timer *t; - - if (!deprecated_ok("Posix timers")) - return -1; - - img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); - if (!img) - return -1; - - ta->posix_timers_n = 0; - while (1) { - PosixTimerEntry *pte; - - ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); - if (ret <= 0) - break; - - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - break; - - ret = decode_posix_timer(pte, t); - if (ret < 0) - break; - - posix_timer_entry__free_unpacked(pte, NULL); - ta->posix_timers_n++; - } - - close_image(img); - if (!ret) - sort_posix_timers(ta); - - return ret; -} - -static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) -{ - int i, ret = -1; - TaskTimersEntry *tte = core->tc->timers; - struct restore_posix_timer *t; - - ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); - - if (!tte) - return prepare_posix_timers_from_fd(pid, ta); - - ta->posix_timers_n = tte->n_posix; - for (i = 0; i < ta->posix_timers_n; i++) { - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - goto out; - - if (decode_posix_timer(tte->posix[i], t)) - goto out; - } - - ret = 0; - sort_posix_timers(ta); -out: - return ret; -} - -static inline int verify_cap_size(CredsEntry *ce) -{ - return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) && (ce->n_cap_prm == CR_CAP_SIZE) && - (ce->n_cap_bnd == CR_CAP_SIZE)); -} - static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; @@ -2964,7 +2486,7 @@ static int 
prepare_mm(pid_t pid, struct task_restore_args *args) args->fd_exe_link = exe_fd; - args->has_thp_enabled = rsti(current)->has_thp_enabled; + args->thp_disabled = mm->has_thp_disabled && mm->thp_disabled; ret = 0; out: @@ -3038,7 +2560,7 @@ static int validate_sched_parm(struct rst_sched_param *sp) if ((sp->nice < -20) || (sp->nice > 19)) return 0; - switch (sp->policy) { + switch (sp->policy & ~SCHED_RESET_ON_FORK) { case SCHED_RR: case SCHED_FIFO: return ((sp->prio > 0) && (sp->prio < 100)); @@ -3099,7 +2621,15 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) if (!kdat.has_ptrace_get_rseq_conf) { #if defined(__GLIBC__) && defined(RSEQ_SIG) rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); + /* + * Current glibc reports the feature/active size in + * __rseq_size, not the size passed to the kernel. + * This could be 20, but older kernels expect 32 for + * the size argument even if only 20 bytes are used. + */ rseq->rseq_abi_size = __rseq_size; + if (rseq->rseq_abi_size < 32) + rseq->rseq_abi_size = 32; rseq->signature = RSEQ_SIG; #else rseq->rseq_abi_pointer = 0; @@ -3353,17 +2883,31 @@ static bool groups_match(gid_t *groups, int n_groups) return ret; } +static void copy_caps(u32 *out_caps, u32 *in_caps, int n_words) +{ + int i, cap_end; + + for (i = kdat.last_cap + 1; i < 32 * n_words; ++i) { + if (~in_caps[i / 32] & (1 << (i % 32))) + continue; + + pr_warn("Dropping unsupported capability %d > %d)\n", i, kdat.last_cap); + /* extra caps will be cleared below */ + } + + n_words = min(n_words, (kdat.last_cap + 31) / 32); + cap_end = (kdat.last_cap & 31) + 1; + memcpy(out_caps, in_caps, sizeof(*out_caps) * n_words); + if ((cap_end & 31) && n_words) + out_caps[n_words - 1] &= (1 << cap_end) - 1; + memset(out_caps + n_words, 0, sizeof(*out_caps) * (CR_CAP_SIZE - n_words)); +} + static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) { unsigned long this_pos; struct thread_creds_args 
*args; - if (!verify_cap_size(ce)) { - pr_err("Caps size mismatch %d %d %d %d\n", (int)ce->n_cap_inh, (int)ce->n_cap_eff, (int)ce->n_cap_prm, - (int)ce->n_cap_bnd); - return ERR_PTR(-EINVAL); - } - this_pos = rst_mem_align_cpos(RM_PRIVATE); args = rst_mem_alloc(sizeof(*args), RM_PRIVATE); @@ -3451,10 +2995,10 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args->creds.groups = NULL; args->creds.lsm_profile = NULL; - memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh)); - memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff)); - memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm)); - memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd)); + copy_caps(args->cap_inh, ce->cap_inh, ce->n_cap_inh); + copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); + copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); + copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; diff --git a/criu/cr-service.c b/criu/cr-service.c index 314c309be9..adb5cedde3 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -240,6 +240,49 @@ int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) return 0; } +int exec_rpc_query_external_files(char *name, int sk) +{ + int i, ret; + CriuNotify cn = CRIU_NOTIFY__INIT; + CriuResp msg = CRIU_RESP__INIT; + CriuReq *req; + + cn.script = name; + + msg.type = CRIU_REQ_TYPE__NOTIFY; + msg.success = true; + msg.notify = &cn; + + ret = send_criu_msg_with_fd(sk, &msg, -1); + if (ret < 0) + return ret; + + ret = recv_criu_msg(sk, &req); + if (ret < 0) + return ret; + + if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { + pr_err("RPC client reported script error\n"); + return -1; + } + + ret = 0; + if (req->opts) + for (i = 0; i < req->opts->n_external; i++) { + char *key = req->opts->external[i]; + pr_info("Adding external object: %s\n", key); + if (add_external(key)) { + pr_err("Failed to 
add external object: %s\n", key); + ret = -1; + } + } + else + pr_info("RPC NOTIFY %s: no `opts` returned.\n", name); + + criu_req__free_unpacked(req, NULL); + return ret; +} + static char images_dir[PATH_MAX]; static int setup_opts_from_req(int sk, CriuOpts *req) @@ -339,8 +382,14 @@ static int setup_opts_from_req(int sk, CriuOpts *req) */ if (imgs_changed_by_rpc_conf) strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else + else if (req->images_dir_fd != -1) sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + else if (req->images_dir) + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + goto err; + } if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); @@ -394,6 +443,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; } else if (!opts.output) { SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); } @@ -428,6 +480,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_leave_running && req->leave_running) opts.final_state = TASK_ALIVE; + if (req->has_leave_stopped && req->leave_stopped) + opts.final_state = TASK_STOPPED; + if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; @@ -520,6 +575,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) case CRIU_NETWORK_LOCK_METHOD__NFTABLES: opts.network_lock_method = NETWORK_LOCK_NFTABLES; break; + case CRIU_NETWORK_LOCK_METHOD__SKIP: + opts.network_lock_method = NETWORK_LOCK_SKIP; + break; default: goto err; } @@ -720,6 +778,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->orphan_pts_master) opts.orphan_pts_master = true; + if (req->has_display_stats) + opts.display_stats = req->display_stats; + /* Evaluate additional configuration file a second time to overwrite * 
all RPC settings. */ if (req->config_file) { @@ -1249,6 +1310,8 @@ int cr_service_work(int sk) int ret = -1; CriuReq *msg = 0; + util_init(); + more: opts.mode = CR_SWRK; @@ -1467,6 +1530,13 @@ int cr_service(bool daemon_mode) close(server_fd); init_opts(); + /* + * We want to have an unique criu_run_id + * here so that each service worker fork here + * can create its own sockets file descriptors + * despite being in the same network namespace. + */ + util_init(); ret = cr_service_work(sk); close(sk); exit(ret != 0); diff --git a/criu/crtools.c b/criu/crtools.c index 94657f4186..b67af0b72e 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -169,7 +169,15 @@ int main(int argc, char *argv[], char *envp[]) pr_err("unknown command: %s\n", argv[optind]); goto usage; } - + /* + * During dump, restore and parasite it's important for us + * to initialize criu_run_id and compel_run_id so that + * sockets and file descriptors are generated with an unique + * name identifying the specific process even in cases + * where multiple processes with the same pid in different + * pid namespaces are sharing the same network namespace. 
+ */ + util_init(); if (opts.mode == CR_SWRK) { if (argc != optind + 2) { fprintf(stderr, "Usage: criu swrk \n"); @@ -254,8 +262,6 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - util_init(); - if (log_init(opts.output)) return 1; diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 83dc1fc8d6..2272e6d842 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -1,6 +1,7 @@ #include #include "criu-log.h" #include "fault-injection.h" +#include "seize.h" enum faults fi_strategy; @@ -21,5 +22,13 @@ int fault_injection_init(void) } fi_strategy = start; + + switch (fi_strategy) { + case FI_DISABLE_FREEZE_CGROUP: + dont_use_freeze_cgroup(); + break; + default: + break; + }; return 0; } diff --git a/criu/files-reg.c b/criu/files-reg.c index ed8b9c8899..fc61493501 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -407,46 +407,24 @@ static int mklnk_ghost(char *path, GhostFileEntry *gfe) static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) { struct timeval tv[2]; - int ret = -1; - if (S_ISLNK(gfe->mode)) { - if (lchown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (cr_fchpermat(AT_FDCWD, path, gfe->uid, gfe->gid, gfe->mode, AT_SYMLINK_NOFOLLOW) < 0) + return -1; - /* - * We have no lchmod() function, and fchmod() will fail on - * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() - * function and flag AT_SYMLINK_NOFOLLOW described in - * man 2 fchmodat, but it is not currently implemented. 
%) - */ - } else { - if (chown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (!gfe->atim) + return 0; - if (chmod(path, gfe->mode)) { - pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); - goto err; - } - } + tv[0].tv_sec = gfe->atim->tv_sec; + tv[0].tv_usec = gfe->atim->tv_usec; + tv[1].tv_sec = gfe->mtim->tv_sec; + tv[1].tv_usec = gfe->mtim->tv_usec; - if (gfe->atim) { - tv[0].tv_sec = gfe->atim->tv_sec; - tv[0].tv_usec = gfe->atim->tv_usec; - tv[1].tv_sec = gfe->mtim->tv_sec; - tv[1].tv_usec = gfe->mtim->tv_usec; - if (lutimes(path, tv)) { - pr_perror("Can't set access and modification times on ghost %s", path); - goto err; - } + if (lutimes(path, tv)) { + pr_perror("Can't set access and modification times on ghost %s", path); + return -1; } - ret = 0; -err: - return ret; + return 0; } static int create_ghost_dentry(char *path, GhostFileEntry *gfe, struct cr_img *img) @@ -1672,22 +1650,10 @@ static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, co */ static int get_build_id(const int fd, const struct stat *fd_status, unsigned char **build_id) { - char buf[SELFMAG + 1]; - void *start_addr; + char *start_addr; size_t mapped_size; int ret = -1; - if (read(fd, buf, SELFMAG + 1) != SELFMAG + 1) - return -1; - - /* - * The first 4 bytes contain a magic number identifying the file as an - * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and - * ‘F’, respectively. These characters are together defined as ELFMAG. - */ - if (strncmp(buf, ELFMAG, SELFMAG)) - return -1; - /* * If the build-id exists, then it will most likely be present in the * beginning of the file. 
Therefore at most only the first 1 MB of the @@ -1695,16 +1661,25 @@ static int get_build_id(const int fd, const struct stat *fd_status, unsigned cha */ mapped_size = min_t(size_t, fd_status->st_size, BUILD_ID_MAP_SIZE); start_addr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); - if (start_addr == MAP_FAILED) { + if ((void*)start_addr == MAP_FAILED) { pr_warn("Couldn't mmap file with fd %d\n", fd); return -1; } - if (buf[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(start_addr, build_id, fd, mapped_size); - if (buf[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(start_addr, build_id, fd, mapped_size); + /* + * The first 4 bytes contain a magic number identifying the file as an + * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and + * ‘F’, respectively. These characters are together defined as ELFMAG. + */ + if (memcmp(start_addr, ELFMAG, SELFMAG)) + goto out; + if (start_addr[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32((Elf32_Ehdr *)start_addr, build_id, fd, mapped_size); + if (start_addr[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64((Elf64_Ehdr *)start_addr, build_id, fd, mapped_size); + +out: munmap(start_addr, mapped_size); return ret; } @@ -1818,7 +1793,8 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) } if (!skip_for_shell_job && mnt_is_overmounted(mi)) { - pr_err("Open files on overmounted mounts are not supported yet\n"); + pr_err("Open files on overmounted mounts are not supported yet; mount=%d fd=%d path=%s\n", + p->mnt_id, p->fd, link->name + 1); return -1; } @@ -2527,9 +2503,10 @@ static int open_filemap(int pid, struct vma_area *vma) * using dup because dup returns a reference to the same struct file inside kernel, but we * cannot open a new FD. 
*/ - ret = dup(plugin_fd); + ret = plugin_fd; } else if (vma->e->status & VMA_AREA_MEMFD) { - ret = memfd_open(vma->vmfd, &flags); + if (!inherited_fd(vma->vmfd, &ret)) + ret = memfd_open(vma->vmfd, &flags, true); } else { ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); } diff --git a/criu/files.c b/criu/files.c index 3b653e24be..a57fb860fb 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -544,6 +545,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_pidfd_link(link)) + ops = &pidfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -554,6 +557,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } + if (p.fs_type == PID_FS_MAGIC) { + ops = &pidfd_dump_ops; + return do_dump_gen_file(&p, lfd, ops, e); + } + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; @@ -1778,6 +1786,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; + case FD_TYPES__PIDFD: + ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); + break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); @@ -1800,5 +1811,11 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); + + if (init_dead_pidfd_hash()) { + pr_err("Could not initialise hash map for dead pidfds\n"); + return -1; + } + return collect_image(&files_cinfo); } diff --git a/criu/image-desc.c b/criu/image-desc.c index 
d65d9c0986..2d87c73815 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/image.c b/criu/image.c index 9fb390ab7e..9589167fb1 100644 --- a/criu/image.c +++ b/criu/image.c @@ -26,6 +26,14 @@ TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +struct inventory_plugin { + struct list_head node; + char *name; +}; + +struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list); +static int n_inventory_plugins; + int check_img_inventory(bool restore) { int ret = -1; @@ -99,6 +107,19 @@ int check_img_inventory(bool restore) } else { opts.network_lock_method = he->network_lock_method; } + + if (!he->plugins_entry) { + /* backwards compatibility: if the 'plugins_entry' field is missing, + * all plugins should be enabled during restore. + */ + n_inventory_plugins = -1; + } else { + PluginsEntry *pe = he->plugins_entry; + for (int i = 0; i < pe->n_plugins; i++) { + if (add_inventory_plugin(pe->plugins[i])) + goto out_err; + } + } } ret = 0; @@ -110,8 +131,92 @@ int check_img_inventory(bool restore) return ret; } +/** + * Check if the 'plugins' field in the inventory image contains + * the specified plugin name. If found, the plugin is removed + * from the linked list. 
+ */ +bool check_and_remove_inventory_plugin(const char *name, size_t n) +{ + if (n_inventory_plugins == -1) + return true; /* backwards compatibility */ + + if (n_inventory_plugins > 0) { + struct inventory_plugin *p, *tmp; + + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + if (!strncmp(name, p->name, n)) { + xfree(p->name); + list_del(&p->node); + xfree(p); + n_inventory_plugins--; + return true; + } + } + } + + return false; +} + +/** + * We expect during restore all loaded plugins to be removed from + * the inventory_plugins_list. If the list is not empty, show an + * error message for each missing plugin. + */ +int check_inventory_plugins(void) +{ + struct inventory_plugin *p; + + if (n_inventory_plugins <= 0) + return 0; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pr_err("Missing required plugin: %s\n", p->name); + } + + return -1; +} + +/** + * Add plugin name to the inventory image. These values + * can be used to identify required plugins during restore. 
+ */ +int add_inventory_plugin(const char *name) +{ + struct inventory_plugin *p; + + p = xmalloc(sizeof(struct inventory_plugin)); + if (p == NULL) + return -1; + + p->name = xstrdup(name); + if (!p->name) { + xfree(p); + return -1; + } + list_add(&p->node, &inventory_plugins_list); + n_inventory_plugins++; + + return 0; +} + +void free_inventory_plugins_list(void) +{ + struct inventory_plugin *p, *tmp; + + if (!list_empty(&inventory_plugins_list)) { + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + xfree(p->name); + list_del(&p->node); + xfree(p); + } + } + n_inventory_plugins = 0; +} + int write_img_inventory(InventoryEntry *he) { + PluginsEntry pe = PLUGINS_ENTRY__INIT; struct cr_img *img; int ret; @@ -121,8 +226,27 @@ int write_img_inventory(InventoryEntry *he) if (!img) return -1; + if (!list_empty(&inventory_plugins_list)) { + struct inventory_plugin *p; + int i = 0; + + pe.n_plugins = n_inventory_plugins; + pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *)); + if (!pe.plugins) + return -1; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pe.plugins[i] = p->name; + i++; + } + } + he->plugins_entry = &pe; + ret = pb_write_one(img, he, PB_INVENTORY); + free_inventory_plugins_list(); + xfree(pe.plugins); + xfree(he->root_ids); close_image(img); if (ret < 0) diff --git a/criu/img-streamer.c b/criu/img-streamer.c index 7e36eae012..305e6fae5e 100644 --- a/criu/img-streamer.c +++ b/criu/img-streamer.c @@ -12,6 +12,7 @@ #include "rst-malloc.h" #include "common/scm.h" #include "common/lock.h" +#include "action-scripts.h" /* * We use different path names for the dump and restore sockets because: @@ -49,10 +50,17 @@ static const char *socket_name_for_mode(int mode) int img_streamer_init(const char *image_dir, int mode) { struct sockaddr_un addr; + int pre_stream_ret; int sockfd; img_streamer_mode = mode; + pre_stream_ret = run_scripts(ACT_PRE_STREAM); + if (pre_stream_ret != 0) { + pr_err("Pre-stream script failed with %d!\n", 
pre_stream_ret); + return -1; + } + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sockfd < 0) { pr_perror("Unable to instantiate UNIX socket"); diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h index c2e8850aab..6a331a32f8 100644 --- a/criu/include/action-scripts.h +++ b/criu/include/action-scripts.h @@ -4,6 +4,7 @@ #include "asm/int.h" enum script_actions { + ACT_PRE_STREAM, ACT_PRE_DUMP, ACT_POST_DUMP, ACT_PRE_RESTORE, @@ -16,6 +17,7 @@ enum script_actions { ACT_PRE_RESUME, ACT_ORPHAN_PTS_MASTER, ACT_STATUS_READY, + ACT_QUERY_EXT_FILES, ACT_MAX }; @@ -24,6 +26,8 @@ extern int add_script(char *path); extern int add_rpc_notify(int sk); extern int run_scripts(enum script_actions); extern int rpc_send_fd(enum script_actions, int fd); +extern int rpc_query_external_files(void); +extern int exec_rpc_query_external_files(char *name, int sk); extern int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); #endif /* __CR_ACTION_SCRIPTS_H__ */ diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 93f61539cf..dc264032e8 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -9,7 +9,8 @@ struct parasite_dump_cgroup_args; extern u32 root_cg_set; int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); int dump_cgroups(void); -int prepare_task_cgroup(struct pstree_item *); +int restore_task_cgroup(struct pstree_item *); +int prepare_cgroup_namespace(struct pstree_item *); int prepare_cgroup(void); /* Restore things like cpu_limit in known cgroups. 
*/ int prepare_cgroup_properties(void); diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index c7e98c756c..60cf9437e6 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -67,6 +67,7 @@ struct cg_root_opt { enum NETWORK_LOCK_METHOD { NETWORK_LOCK_IPTABLES, NETWORK_LOCK_NFTABLES, + NETWORK_LOCK_SKIP, }; #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES diff --git a/criu/include/criu-log.h b/criu/include/criu-log.h index ae2f38489c..9d52fbdb17 100644 --- a/criu/include/criu-log.h +++ b/criu/include/criu-log.h @@ -26,7 +26,6 @@ extern int log_init(const char *output); extern void log_fini(void); extern int log_init_by_pid(pid_t pid); -extern void log_closedir(void); extern int log_keep_err(void); extern char *log_first_err(void); diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 886832eaaa..392ea9f534 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -56,6 +56,10 @@ enum { CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9, + CR_PLUGIN_HOOK__PAUSE_DEVICES = 10, + + CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + CR_PLUGIN_HOOK__MAX }; @@ -72,6 +76,8 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); enum { CR_PLUGIN_STAGE__DUMP, diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 69d670be93..59adf05b9e 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -19,6 +19,10 @@ enum faults { FI_HUGE_ANON_SHMEM_ID = 132, FI_CANNOT_MAP_VDSO = 133, FI_CORRUPT_EXTREGS = 134, + FI_DONT_USE_PAGEMAP_SCAN = 135, + FI_DUMP_CRASH = 136, + 
FI_DISABLE_FREEZE_CGROUP = 137, + FI_PLUGIN_CUDA_FORCE_ENABLE = 138, FI_MAX, }; diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ad34f48915..ffc0455d5f 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -57,4 +57,8 @@ #define OVERLAYFS_SUPER_MAGIC 0x794c7630 #endif +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 9f369be645..79e1ac1113 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -113,6 +113,7 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, + CR_FD_PIDFD, CR_FD_AUTOFS, diff --git a/criu/include/image.h b/criu/include/image.h index 5cb01bde20..afa7d5e12f 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -35,13 +35,15 @@ * - stack * the memory area is used in application stack so we * should be careful about guard page here + * - shadow stack + * the memory area is used by shadow stack * - vsyscall * special memory area injected into the task memory * space by the kernel itself, represent virtual syscall * implementation and it is specific to every kernel version, * its contents should not be dumped ever * - vdso,vvar - * the vDSO area, it might reqire additional memory + * the vDSO area, it might require additional memory * contents modification especially when tasks are * migrating between different kernel versions * - heap @@ -84,6 +86,7 @@ #define VMA_AREA_VVAR (1 << 12) #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_SHSTK (1 << 15) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) @@ -174,4 +177,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size); extern void close_image(struct cr_img *); +extern int add_inventory_plugin(const char *name); +extern int check_inventory_plugins(void); +extern bool check_and_remove_inventory_plugin(const char *name, size_t n); + #endif /* __CR_IMAGE_H__ */ 
diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 0b2f715f38..e03a573419 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -85,6 +85,10 @@ struct kerndat_s { bool has_ptrace_get_rseq_conf; struct __ptrace_rseq_configuration libc_rseq_conf; bool has_ipv6_freebind; + bool has_membarrier_get_registrations; + bool has_pagemap_scan; + bool has_shstk; + bool has_close_range; }; extern struct kerndat_s kdat; diff --git a/criu/include/log.h b/criu/include/log.h index 85e6dc2e72..cbed330076 100644 --- a/criu/include/log.h +++ b/criu/include/log.h @@ -60,6 +60,8 @@ void flush_early_log_buffer(int fd); #define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) +#define pr_pwarn(fmt, ...) pr_warn(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) + #endif /* CR_NOGLIBC */ #endif /* __CR_LOG_H__ */ diff --git a/criu/include/magic.h b/criu/include/magic.h index 22d7218e45..6f0aff26d8 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -29,7 +29,7 @@ /* * The magic-s below correspond to coordinates - * of various Russian towns in the NNNNEEEE form. + * of various towns in the NNNNEEEE form. 
*/ #define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ @@ -100,6 +100,7 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define PIDFD_MAGIC 0x54435556 /* Ufa */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/mem.h b/criu/include/mem.h index 03574ea3d7..3618c9cc3b 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -7,6 +7,7 @@ #include "pid.h" #include "proc_parse.h" #include "inventory.pb-c.h" +#include "pagemap-cache.h" struct parasite_ctl; struct vm_area_list; @@ -47,5 +48,6 @@ int open_vmas(struct pstree_item *t); int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -bool should_dump_page(VmaEntry *vmae, u64 pme); + +u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty); #endif /* __CR_MEM_H__ */ diff --git a/criu/include/memfd.h b/criu/include/memfd.h index 1b1dc79bbc..78d8100198 100644 --- a/criu/include/memfd.h +++ b/criu/include/memfd.h @@ -1,7 +1,9 @@ #ifndef __CR_MEMFD_H__ #define __CR_MEMFD_H__ +#include #include + #include "int.h" #include "common/config.h" @@ -12,7 +14,7 @@ extern int is_memfd(dev_t dev); extern int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms); extern const struct fdtype_ops memfd_dump_ops; -extern int memfd_open(struct file_desc *d, u32 *fdflags); +extern int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap); extern struct collect_image_info memfd_cinfo; extern struct file_desc *collect_memfd(u32 id); extern int apply_memfd_seals(void); diff --git a/criu/include/net.h b/criu/include/net.h index 0da4cad137..5e8a848620 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -50,7 +50,6 @@ extern int kerndat_has_newifindex(void); extern int kerndat_link_nsid(void); extern int net_get_nsid(int rtsk, 
int fd, int *nsid); extern struct ns_id *net_get_root_ns(void); -extern int kerndat_nsid(void); extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); extern struct ns_id *get_root_netns(void); diff --git a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h index 1d8bbffaf6..875e69e560 100644 --- a/criu/include/pagemap-cache.h +++ b/criu/include/pagemap-cache.h @@ -1,10 +1,12 @@ #ifndef __CR_PAGEMAP_H__ #define __CR_PAGEMAP_H__ +#include #include #include "int.h" #include "common/list.h" +#include "pagemap_scan.h" struct vma_area; @@ -15,9 +17,15 @@ typedef struct { unsigned long start; /* start of area */ unsigned long end; /* end of area */ const struct list_head *vma_head; /* list head of VMAs we're serving */ + int fd; /* file to read PMs from */ + u64 *map; /* local buffer */ size_t map_len; /* length of a buffer */ - int fd; /* file to read PMs from */ + + struct page_region *regs; /* buffer for the PAGEMAP_SCAN ioctl */ + size_t regs_len; /* actual length of regs */ + size_t regs_max_len; /* maximum length of regs */ + size_t regs_idx; /* current index in the regs array */ } pmc_t; #define PMC_INIT \ @@ -26,7 +34,8 @@ typedef struct { } extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); -extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); +extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma); extern void pmc_fini(pmc_t *pmc); +extern int pmc_fill(pmc_t *pmc, u64 start, u64 end); #endif /* __CR_PAGEMAP_H__ */ diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h new file mode 100644 index 0000000000..0ad4c9bc0b --- /dev/null +++ b/criu/include/pagemap_scan.h @@ -0,0 +1,68 @@ +#ifndef __CR_PAGEMAP_SCAN_H__ +#define __CR_PAGEMAP_SCAN_H__ + +#ifndef PAGEMAP_SCAN +#include +#include "int.h" + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. 
*/ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + u64 start; + u64 end; + u64 categories; +}; + +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end: Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on the entire range.
+ * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + u64 size; + u64 flags; + u64 start; + u64 end; + u64 walk_end; + u64 vec; + u64 vec_len; + u64 max_pages; + u64 category_inverted; + u64 category_mask; + u64 category_anyof_mask; + u64 return_mask; +}; +#endif /* PAGEMAP_SCAN */ + +#endif /* __CR_PAGEMAP_SCAN_H__ */ diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h index 4540e11ee7..4a8ec2fee6 100644 --- a/criu/include/parasite-syscall.h +++ b/criu/include/parasite-syscall.h @@ -21,13 +21,6 @@ struct rt_sigframe; struct parasite_ctl; struct parasite_thread_ctl; -extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); -extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *); - -struct proc_posix_timers_stat; -extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *); - extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core); diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 787c927be9..1244220f67 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -118,6 +118,8 @@ static inline int posix_timers_dump_size(int timer_n) */ struct parasite_dump_misc { + bool has_membarrier_get_registrations; /* 
this is sent from criu to parasite. */ + unsigned long brk; u32 pid; @@ -128,6 +130,7 @@ struct parasite_dump_misc { int dumpable; int thp_disabled; int child_subreaper; + int membarrier_registration_mask; }; /* @@ -148,6 +151,7 @@ struct parasite_dump_creds { int uids[4]; int gids[4]; + int no_new_privs; unsigned int secbits; unsigned int ngroups; /* diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h new file mode 100644 index 0000000000..4d2d71700e --- /dev/null +++ b/criu/include/pidfd.h @@ -0,0 +1,16 @@ +#ifndef __CR_PIDFD_H__ +#define __CR_PIDFD_H__ + +#include "files.h" +#include "pidfd.pb-c.h" + +extern const struct fdtype_ops pidfd_dump_ops; +extern struct collect_image_info pidfd_cinfo; +extern int is_pidfd_link(char *link); +extern int init_dead_pidfd_hash(void); +struct pidfd_dump_info { + PidfdEntry pidfe; + pid_t pid; +}; + +#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/prctl.h b/criu/include/prctl.h index c843f40a75..4c2a548b16 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -30,6 +30,12 @@ #ifndef PR_SET_DUMPABLE #define PR_SET_DUMPABLE 4 #endif +#ifndef PR_GET_NO_NEW_PRIVS +#define PR_GET_NO_NEW_PRIVS 39 +#endif +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#endif #ifndef PR_SET_MM #define PR_SET_MM 35 diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101f..c4241be557 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,7 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_PIDFD, /* PB_AUTOGEN_STOP */ diff --git a/criu/include/restore.h b/criu/include/restore.h index 8ef0dbddf8..04d0065051 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -7,4 +7,35 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); +struct task_restore_args; +struct pstree_item; + +#ifndef arch_shstk_prepare +static inline int arch_shstk_prepare(struct pstree_item *item, + CoreEntry *core, + struct task_restore_args *ta) +{ + 
return 0; +} +#define arch_shstk_prepare arch_shstk_prepare +#endif + +#ifndef arch_shstk_unlock +static inline int arch_shstk_unlock(struct pstree_item *item, + CoreEntry *core, pid_t pid) +{ + return 0; +} +#define arch_shstk_unlock arch_shstk_unlock +#endif + +#ifndef arch_shstk_trampoline +static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + return func(arg); +} +#define arch_shstk_trampoline arch_shstk_trampoline +#endif + #endif diff --git a/criu/include/restorer.h b/criu/include/restorer.h index bc0beb5cbb..3fb5322a4b 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -56,6 +56,10 @@ struct restore_posix_timer { int overrun; }; +#ifndef rst_shstk_info +struct rst_shstk_info {}; +#endif + /* * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things @@ -72,7 +76,6 @@ struct thread_creds_args { u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; - unsigned int secbits; char *lsm_profile; unsigned int *groups; char *lsm_sockcreate; @@ -120,6 +123,8 @@ struct thread_restore_args { unsigned int seccomp_filters_n; bool seccomp_force_tsync; + struct rst_shstk_info shstk; + char comm[TASK_COMM_LEN]; int cg_set; int cgroupd_sk; @@ -144,7 +149,7 @@ struct task_restore_args { struct timeval logstart; int uffd; - bool has_thp_enabled; + bool thp_disabled; /* threads restoration */ int nr_threads; /* number of threads */ @@ -230,6 +235,7 @@ struct task_restore_args { #endif int lsm_type; int child_subreaper; + int membarrier_registration_mask; bool has_clone3_set_tid; /* @@ -240,6 +246,8 @@ struct task_restore_args { uid_t uid; u32 cap_eff[CR_CAP_SIZE]; + + struct rst_shstk_info shstk; } __aligned(64); /* @@ -331,4 +339,20 @@ enum { #define __r_sym(name) restorer_sym##name #define restorer_sym(rblob, name) (void *)(rblob + __r_sym(name)) +#ifndef arch_shstk_switch_to_restorer +static inline int 
arch_shstk_switch_to_restorer(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_switch_to_restorer arch_shstk_switch_to_restorer +#endif + +#ifndef arch_shstk_restore +static inline int arch_shstk_restore(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_restore arch_shstk_restore +#endif + #endif /* __CR_RESTORER_H__ */ diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index d0a3db6c5d..59b891fa26 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -73,10 +73,11 @@ struct rst_info { */ bool has_old_seccomp_filter; - bool has_thp_enabled; - struct rst_rseq *rseqe; + futex_t shstk_enable; + futex_t shstk_unlock; + void *breakpoint; }; diff --git a/criu/include/seize.h b/criu/include/seize.h index 4545bf2627..f5ea76b16c 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,11 +2,13 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); +struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); extern bool alarm_timeouted(void); extern char *task_comm_info(pid_t pid, char *comm, size_t size); extern char *__task_comm_info(pid_t pid); +extern void dont_use_freeze_cgroup(void); #endif diff --git a/criu/include/shmem.h b/criu/include/shmem.h index 813ef630ef..15cab11464 100644 --- a/criu/include/shmem.h +++ b/criu/include/shmem.h @@ -4,13 +4,14 @@ #include "int.h" #include "common/lock.h" #include "images/vma.pb-c.h" +#include "pagemap-cache.h" struct vma_area; extern int collect_shmem(int pid, struct vma_area *vma); extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); -extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); +extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc); extern int fixup_sysv_shmems(void); extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size); extern int dump_one_sysv_shmem(void *addr, 
unsigned long size, unsigned long shmid); diff --git a/criu/include/sigact.h b/criu/include/sigact.h new file mode 100644 index 0000000000..4df011f961 --- /dev/null +++ b/criu/include/sigact.h @@ -0,0 +1,14 @@ +#ifndef __CR_SIGACT_H__ +#define __CR_SIGACT_H__ + +#include "images/core.pb-c.h" + +extern rt_sigaction_t sigchld_act; + +struct parasite_ctl; +struct pstree_item; + +extern int prepare_sigactions(CoreEntry *core); +extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); + +#endif diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 961d711ee7..69ee8589e6 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -69,6 +69,7 @@ extern int inet_connect(int sk, struct inet_sk_info *); #ifdef CR_NOGLIBC #define setsockopt sys_setsockopt +#define pr_perror(fmt, ...) pr_err(fmt ": errno %d\n", ##__VA_ARGS__, -ret) #endif static inline void tcp_repair_off(int fd) { @@ -76,7 +77,7 @@ static inline void tcp_repair_off(int fd) ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) - pr_err("Failed to turn off repair mode on socket\n"); + pr_perror("Failed to turn off repair mode on socket %d", fd); } extern void tcp_locked_conn_add(struct inet_sk_info *); @@ -86,6 +87,9 @@ extern void cpt_unlock_tcp_connections(void); extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); extern int restore_one_tcp(int sk, struct inet_sk_info *si); +extern int dump_tcp_opts(int sk, TcpOptsEntry *toe); +extern int restore_tcp_opts(int sk, TcpOptsEntry *toe); + #define SK_EST_PARAM "tcp-established" #define SK_INFLIGHT_PARAM "skip-in-flight" #define SK_CLOSE_PARAM "tcp-close" diff --git a/criu/include/syscall.h b/criu/include/syscall.h deleted file mode 100644 index c38d6d971b..0000000000 --- a/criu/include/syscall.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __CR_SYSCALL_H__ -#define __CR_SYSCALL_H__ - -static inline int sys_fsopen(const char *fsname, unsigned int flags) -{ - return 
syscall(__NR_fsopen, fsname, flags); -} -static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) -{ - return syscall(__NR_fsconfig, fd, cmd, key, value, aux); -} -static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) -{ - return syscall(__NR_fsmount, fd, flags, attr_flags); -} - -#endif /* __CR_SYSCALL_H__ */ \ No newline at end of file diff --git a/criu/include/timer.h b/criu/include/timer.h new file mode 100644 index 0000000000..d1deb6051d --- /dev/null +++ b/criu/include/timer.h @@ -0,0 +1,17 @@ +#ifndef __CR_TIMER_H__ +#define __CR_TIMER_H__ + +#include "images/core.pb-c.h" + +struct task_restore_args; +struct pstree_item; +struct parasite_ctl; +struct proc_posix_timers_stat; + +extern int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); +extern int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); + +extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item); +extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item); +#endif diff --git a/criu/include/util.h b/criu/include/util.h index 4b4dfda950..ae293a68c8 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -170,6 +170,7 @@ extern pid_t fork_and_ptrace_attach(int (*child_setup)(void)); extern int cr_daemon(int nochdir, int noclose, int close_fd); extern int status_ready(void); extern int is_root_user(void); +extern int close_fds(int minfd); extern int set_proc_self_fd(int fd); @@ -263,6 +264,10 @@ bool is_path_prefix(const char *path, const char *prefix); FILE *fopenat(int dirfd, char *path, char *cflags); void split(char *str, char token, char ***out, int *n); +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid); +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode); +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t 
new_gid, mode_t new_mode, int flags); + int fd_has_data(int lfd); int make_yard(char *path); @@ -274,8 +279,6 @@ static inline int sk_wait_data(int sk) } void fd_set_nonblocking(int fd, bool on); -void tcp_nodelay(int sk, bool on); -void tcp_cork(int sk, bool on); const char *ns_to_string(unsigned int ns); @@ -384,6 +387,11 @@ static inline void print_stack_trace(pid_t pid) extern int mount_detached_fs(const char *fsname); +extern int cr_fsopen(const char *fsname, unsigned int flags); +extern int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux); +extern int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags); +extern void fsfd_dump_messages(int fd); + extern char *get_legacy_iptables_bin(bool ipv6, bool restore); extern int set_opts_cap_eff(void); @@ -409,4 +417,6 @@ extern void util_init(void); extern char *resolve_mountpoint(char *path); +extern int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); + #endif /* __CR_UTIL_H__ */ diff --git a/criu/include/vma.h b/criu/include/vma.h index 106c56af26..b8ddfc1422 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -106,6 +106,7 @@ static inline bool vma_entry_is_private(VmaEntry *entry, unsigned long task_size return (vma_entry_is(entry, VMA_AREA_REGULAR) && (vma_entry_is(entry, VMA_ANON_PRIVATE) || vma_entry_is(entry, VMA_FILE_PRIVATE)) && (entry->end <= task_size)) || + vma_entry_is(entry, VMA_AREA_SHSTK) || vma_entry_is(entry, VMA_AREA_AIORING); } @@ -122,8 +123,8 @@ static inline struct vma_area *vma_next(struct vma_area *vma) static inline bool vma_entry_can_be_lazy(VmaEntry *e) { return ((e->flags & MAP_ANONYMOUS) && (e->flags & MAP_PRIVATE) && !(e->flags & MAP_LOCKED) && - !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && - !(e->flags & MAP_HUGETLB)); + !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VVAR)) && + !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && !(e->flags & MAP_HUGETLB)); } #endif /* 
__CR_VMA_H__ */ diff --git a/criu/irmap.c b/criu/irmap.c index 7b9d77bc1f..d2c5d588a2 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -67,6 +67,7 @@ static struct irmap hints[] = { .path = "/var/log", .nr_kids = -1, }, + { .path = "/usr/share/dbus-1/services", .nr_kids = -1 }, { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 }, @@ -101,7 +102,7 @@ static int irmap_update_stat(struct irmap *i) pr_debug("Refresh stat for %s\n", i->path); if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { - pr_perror("Can't stat %s", i->path); + pr_pwarn("Can't stat %s", i->path); return -1; } @@ -136,7 +137,7 @@ static int irmap_update_dir(struct irmap *t) pr_debug("Refilling %s dir\n", t->path); fd = openat(mntns_root, t->path + 1, O_RDONLY); if (fd < 0) { - pr_perror("Can't open %s", t->path); + pr_pwarn("Can't open %s", t->path); return -1; } @@ -499,8 +500,13 @@ int irmap_scan_path_add(char *path) return -1; } - o->ir->path = path; + o->ir->path = xstrdup(path); + if (!o->ir->path) { + xfree(o->ir); + xfree(o); + return -1; + } o->ir->nr_kids = -1; - list_add(&o->node, &opts.irmap_scan_paths); + list_add_tail(&o->node, &opts.irmap_scan_paths); return 0; } diff --git a/criu/kerndat.c b/criu/kerndat.c index bc0c7ba05d..fa1ed21fad 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -12,11 +12,12 @@ #include #include #include -#include /* for sockaddr_in and inet_ntoa() */ +#include #include #include #include #include +#include #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include @@ -53,13 +54,23 @@ #include "memfd.h" #include "mount-v2.h" #include "util-caps.h" +#include "pagemap_scan.h" struct kerndat_s kdat = {}; +volatile int dummy_var; static int check_pagemap(void) { - int ret, fd; + int ret, fd, retry; u64 pfn = 0; + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags 
= 0, + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); if (fd < 0) { @@ -72,11 +83,40 @@ static int check_pagemap(void) return -1; } - /* Get the PFN of some present page. Stack is here, so try it :) */ - ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn)); - if (ret != sizeof(pfn)) { - pr_perror("Can't read pagemap"); - return -1; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { + pr_debug("PAGEMAP_SCAN is supported\n"); + kdat.has_pagemap_scan = true; + } else { + switch (errno) { + case EINVAL: + case ENOTTY: + pr_debug("PAGEMAP_SCAN isn't supported\n"); + break; + default: + pr_perror("PAGEMAP_SCAN failed with unexpected errno"); + return -1; + } + } + + retry = 3; + while (retry--) { + ++dummy_var; + /* Get the PFN of a page likely to be present. */ + ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); + if (ret != sizeof(pfn)) { + pr_perror("Can't read pagemap"); + close(fd); + return -1; + } + /* The page can be swapped out by the time the read occurs, + * in which case the rest of the bits are a swap type + offset + * (which could be zero even if not hidden). + * Retry if this happens. */ + if (pfn & PME_PRESENT) + break; + pr_warn("got non-present PFN %#lx for the dummy data page; %s\n", (unsigned long)pfn, + retry ? 
"retrying" : "giving up"); + pfn = 0; } close(fd); @@ -465,8 +505,15 @@ static int get_last_cap(void) struct sysctl_req req[] = { { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, }; + int ret; + + ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + if (ret || kdat.last_cap < 32 * CR_CAP_SIZE) + return ret; - return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + pr_err("Kernel reports more capabilities than this CRIU supports: %u > %u\n", + kdat.last_cap, 32 * CR_CAP_SIZE - 1); + return -1; } static bool kerndat_has_memfd_create(void) @@ -615,29 +662,52 @@ static int kerndat_iptables_has_xtlocks(void) return 0; } -int kerndat_tcp_repair(void) +/* + * Unfortunately in C htonl() is not constexpr and cannot be used in a static + * initialization below. + */ +#define constant_htonl(x) \ + (__BYTE_ORDER == __BIG_ENDIAN ? (x) : \ + (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) + +static int kerndat_tcp_repair(void) { + static const struct sockaddr_in loopback_ip4 = { + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { constant_htonl(INADDR_LOOPBACK) }, + }; + static const struct sockaddr_in6 loopback_ip6 = { + .sin6_family = AF_INET6, + .sin6_port = 0, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; int sock, clnt = -1, yes = 1, exit_code = -1; - struct sockaddr_in addr; - socklen_t aux; + const struct sockaddr *addr; + struct sockaddr_storage listener_addr; + socklen_t addrlen; - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - inet_pton(AF_INET, "127.0.0.1", &(addr.sin_addr)); - addr.sin_port = 0; + addr = (const struct sockaddr *)&loopback_ip4; + addrlen = sizeof(loopback_ip4); sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) { + addr = (const struct sockaddr *)&loopback_ip6; + addrlen = sizeof(loopback_ip6); + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); + } if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } - if 
(bind(sock, (struct sockaddr *)&addr, sizeof(addr))) { + if (bind(sock, addr, addrlen)) { pr_perror("Unable to bind a socket"); goto err; } - aux = sizeof(addr); - if (getsockname(sock, (struct sockaddr *)&addr, &aux)) { + addrlen = sizeof(listener_addr); + if (getsockname(sock, (struct sockaddr *)&listener_addr, &addrlen)) { pr_perror("Unable to get a socket name"); goto err; } @@ -647,13 +717,13 @@ int kerndat_tcp_repair(void) goto err; } - clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + clnt = socket(addr->sa_family, SOCK_STREAM, IPPROTO_TCP); if (clnt < 0) { pr_perror("Unable to create a socket"); goto err; } - if (connect(clnt, (struct sockaddr *)&addr, sizeof(addr))) { + if (connect(clnt, (const struct sockaddr *)&listener_addr, addrlen)) { pr_perror("Unable to connect a socket"); goto err; } @@ -680,20 +750,22 @@ int kerndat_tcp_repair(void) return exit_code; } -int kerndat_nsid(void) +static int kerndat_nsid(void) { int nsid, sk; + kdat.has_nsid = false; + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sk < 0) { - pr_perror("Unable to create a netlink socket"); - return -1; + pr_pwarn("Unable to create a netlink socket: NSID can't be used."); + return 0; } if (net_get_nsid(sk, getpid(), &nsid) < 0) { - pr_err("NSID is not supported\n"); + pr_warn("NSID is not supported\n"); close(sk); - return -1; + return 0; } kdat.has_nsid = true; @@ -762,7 +834,7 @@ static int kerndat_detect_stack_guard_gap(void) * (see kernel commit 1be7107fbe18ee). * * Same time there was semi-complete - * patch released which hitted a number + * patch released which hit a number * of repos (Ubuntu, Fedora) where instead * of PAGE_SIZE the 1M gap is cut off. 
*/ @@ -977,6 +1049,8 @@ int kerndat_sockopt_buf_lock(void) int sock; sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { pr_perror("Unable to create a socket"); return -1; @@ -1077,6 +1151,24 @@ static int kerndat_has_openat2(void) return 0; } +int __attribute__((weak)) kdat_has_shstk(void) +{ + return 0; +} + +static int kerndat_has_shstk(void) +{ + int ret = kdat_has_shstk(); + + if (ret < 0) { + pr_err("kdat_has_shstk failed\n"); + return ret; + } + + kdat.has_shstk = !!ret; + return 0; +} + #define KERNDAT_CACHE_NAME "criu.kdat" #define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME @@ -1324,6 +1416,8 @@ int kerndat_has_thp_disable(void) parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); + if (!kdat.has_thp_disable) + pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE\n"); break; } } @@ -1367,17 +1461,20 @@ static bool kerndat_has_clone3_set_tid(void) */ pid = syscall(__NR_clone3, &args, sizeof(args)); - if (pid == -1 && (errno == ENOSYS || errno == E2BIG)) { - kdat.has_clone3_set_tid = false; - return 0; - } - if (pid == -1 && errno == EINVAL) { - kdat.has_clone3_set_tid = true; - } else { - pr_perror("Unexpected error from clone3"); + if (pid != -1) { + pr_err("Unexpected success: clone3() returned %d\n", pid); return -1; } + if (errno == ENOSYS || errno == E2BIG) + return 0; + + if (errno != EINVAL) { + pr_pwarn("Unexpected error from clone3"); + return 0; + } + + kdat.has_clone3_set_tid = true; return 0; } @@ -1505,7 +1602,9 @@ static int __has_nftables_concat(void *arg) return 1; if (NFT_RUN_CMD(nft, "create table inet CRIU")) { - pr_err("Can't create nftables table\n"); + pr_warn("Can't create nftables table\n"); + *has = false; /* kdat.has_nftables_concat = false */ + ret = 0; goto nft_ctx_free_out; } @@ -1545,9 +1644,31 @@ static int kerndat_has_nftables_concat(void) #define IPV6_FREEBIND 
78 #endif +static int __kerndat_has_ipv6_freebind(int sk) +{ + int val = 1; + + if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { + if (errno == ENOPROTOOPT) { + kdat.has_ipv6_freebind = false; + return 0; + } + pr_perror("Unable to setsockopt ipv6_freebind"); + return -1; + } + + kdat.has_ipv6_freebind = true; + return 0; +} + static int kerndat_has_ipv6_freebind(void) { - int sk, val; + int sk, ret; + + if (!kdat.ipv6) { + kdat.has_ipv6_freebind = false; + return 0; + } sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (sk == -1) { @@ -1555,17 +1676,47 @@ static int kerndat_has_ipv6_freebind(void) return -1; } - val = 1; - if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { - if (errno == ENOPROTOOPT) { - kdat.has_ipv6_freebind = false; - return 0; + ret = __kerndat_has_ipv6_freebind(sk); + close(sk); + return ret; +} + +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int kerndat_has_membarrier_get_registrations(void) +{ + int ret = syscall(__NR_membarrier, 1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0); + if (ret < 0) { + if (errno != EINVAL) { + return ret; } - pr_perror("Unable to setsockopt ipv6_freebind"); + + kdat.has_membarrier_get_registrations = false; + } else { + kdat.has_membarrier_get_registrations = true; + } + + return 0; +} + +static int kerndat_has_close_range(void) +{ + /* fd is greater than max_fd, so close_range should return EINVAL. 
*/ + if (cr_close_range(2, 1, 0) == 0) { + pr_err("close_range succeeded unexpectedly\n"); return -1; } - kdat.has_ipv6_freebind = true; + if (errno == ENOSYS) { + pr_debug("close_range isn't supported\n"); + return 0; + } + if (errno != EINVAL) { + pr_perror("close_range returned unexpected error code"); + return -1; + } + + kdat.has_close_range = true; return 0; } @@ -1595,6 +1746,12 @@ int kerndat_try_load_new(void) return ret; } + ret = kerndat_has_shstk(); + if (ret < 0) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + return ret; + } + /* New information is found, we need to save to the cache */ if (ret) kerndat_save_cache(); @@ -1812,6 +1969,18 @@ int kerndat_init(void) pr_err("kerndat_has_ipv6_freebind failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_membarrier_get_registrations()) { + pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_shstk()) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_close_range()) { + pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/libnetlink.c b/criu/libnetlink.c index f0304b0dbc..c7a84a44d3 100644 --- a/criu/libnetlink.c +++ b/criu/libnetlink.c @@ -214,8 +214,3 @@ int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], in return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy); } - -int32_t nla_get_s32(const struct nlattr *nla) -{ - return *(const int32_t *)nla_data(nla); -} diff --git a/criu/mem.c b/criu/mem.c index ab86a1f6d7..c9578ef441 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -99,7 +99,7 @@ static inline bool __page_in_parent(bool dirty) return opts.track_mem && opts.img_parent && !dirty; } -bool should_dump_page(VmaEntry *vmae, u64 pme) +static bool 
should_dump_entire_vma(VmaEntry *vmae) { /* * vDSO area must be always dumped because on restore @@ -107,30 +107,53 @@ bool should_dump_page(VmaEntry *vmae, u64 pme) */ if (vma_entry_is(vmae, VMA_AREA_VDSO)) return true; - /* - * In turn VVAR area is special and referenced from - * vDSO area by IP addressing (at least on x86) thus - * never ever dump its content but always use one provided - * by the kernel on restore, ie runtime VVAR area must - * be remapped into proper place.. - */ - if (vma_entry_is(vmae, VMA_AREA_VVAR)) - return false; - - /* - * Optimisation for private mapping pages, that haven't - * yet being COW-ed - */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; - if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) - return true; return false; } +/* + * should_dump_page returns vaddr if an addressed page has to be dumped. + * Otherwise, it returns an address that has to be inspected next. 
+ */ +u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) +{ + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) + return -1; + + if (pmc->regs) { + while (1) { + if (pmc->regs_idx == pmc->regs_len) + return pmc->end; + if (vaddr < pmc->regs[pmc->regs_idx].end) + break; + pmc->regs_idx++; + } + if (vaddr < pmc->regs[pmc->regs_idx].start) + return pmc->regs[pmc->regs_idx].start; + if (softdirty) + *softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + return vaddr; + } else { + u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + + /* + * Optimisation for private mapping pages, that haven't + * yet been COW-ed + */ + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) + return vaddr + PAGE_SIZE; + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { + if (softdirty) + *softdirty = pme & PME_SOFT_DIRTY; + return vaddr; + } + + return vaddr + PAGE_SIZE; + } +} + bool page_is_zero(u64 pme) { return __page_is_zero(pme); @@ -161,28 +184,33 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr) * put the memory into the page-pipe's pipe. * * "Holes" in page-pipe are regions, that should be dumped, but - * the memory contents is present in the pagent image set. + * the memory contents is present in the parent image set.
*/ -static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, +static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr, bool has_parent) { - u64 *at = &map[PAGE_PFN(*off)]; - unsigned long pfn, nr_to_scan; + unsigned long nr_scanned; unsigned long pages[3] = {}; + unsigned long vaddr; + bool dump_all_pages; int ret = 0; - nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; + dump_all_pages = should_dump_entire_vma(vma->e); - for (pfn = 0; pfn < nr_to_scan; pfn++) { - unsigned long vaddr; + nr_scanned = 0; + for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; + bool softdirty = false; + u64 next; int st; - if (!should_dump_page(vma->e, at[pfn])) + /* If dump_all_pages is true, should_dump_page is called to get pme. */ + next = should_dump_page(pmc, vma->e, vaddr, &softdirty); + if (!dump_all_pages && next != vaddr) { + vaddr = next - PAGE_SIZE; continue; - - vaddr = vma->e->start + *off + pfn * PAGE_SIZE; + } if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) ppb_flags |= PPB_LAZY; @@ -194,7 +222,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. 
*/ - if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { + if (has_parent && page_in_parent(softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { @@ -214,9 +242,8 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct pages[st]++; } - *off += pfn * PAGE_SIZE; - - cnt_add(CNT_PAGES_SCANNED, nr_to_scan); + *pvaddr = vaddr; + cnt_add(CNT_PAGES_SCANNED, nr_scanned); cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); @@ -356,12 +383,20 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl, pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode) { - u64 off = 0; - u64 *map; + u64 vaddr; int ret; if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED)) return 0; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. 
+ */ + if (vma_entry_is(vma->e, VMA_AREA_VVAR)) + return 0; /* * To facilitate any combination of pre-dump modes to run after @@ -421,15 +456,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str has_parent = false; } - map = pmc_get_map(pmc, vma); - if (!map) + if (pmc_get_map(pmc, vma)) return -1; if (vma_area_is(vma, VMA_ANON_SHARED)) - return add_shmem_area(item->pid->real, vma->e, map); - + return add_shmem_area(item->pid->real, vma->e, pmc); + vaddr = vma->e->start; again: - ret = generate_iovs(item, vma, pp, map, &off, has_parent); + ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent); if (ret == -EAGAIN) { BUG_ON(!(pp->flags & PP_CHUNK_MODE)); @@ -707,6 +741,8 @@ int prepare_mm_pid(struct pstree_item *i) ri->vmas.rst_priv_size += vma_area_len(vma); if (vma_has_guard_gap_hidden(vma)) ri->vmas.rst_priv_size += PAGE_SIZE; + if (vma_area_is(vma, VMA_AREA_SHSTK)) + ri->vmas.rst_priv_size += PAGE_SIZE; } pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); @@ -848,6 +884,14 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void vma->e->start -= PAGE_SIZE; size = vma_entry_len(vma->e); + + /* + * map an extra page for shadow stack VMAs, it will be used as a + * temporary shadow stack + */ + if (vma_area_is(vma, VMA_AREA_SHSTK)) + size += PAGE_SIZE; + if (!vma_inherited(vma)) { int flag = 0; /* @@ -923,6 +967,15 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head) { + /* + * Shadow stack VMAs cannot be mmap()ed, they must be created using + * map_shadow_stack() system call. + * Premap them to reserve virtual address space and populate them + * to have there contents available for later copying. 
+ */ + if (vma_area_is(vma, VMA_AREA_SHSTK)) + return true; + /* * On kernels with 4K guard pages, growsdown VMAs * always have one guard page at the @@ -1004,7 +1057,7 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo do { if (pr->pe->vaddr + pr->pe->nr_pages * PAGE_SIZE <= vma->e->start) continue; - if (pr->pe->vaddr > vma->e->end) + if (pr->pe->vaddr >= vma->e->end) vma->e->status |= VMA_NO_PROT_WRITE; break; } while (pr->advance(pr)); @@ -1034,6 +1087,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) unsigned int nr_shared = 0; unsigned int nr_dropped = 0; unsigned int nr_compared = 0; + unsigned int nr_enqueued = 0; unsigned int nr_lazy = 0; unsigned long va; @@ -1109,7 +1163,8 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) len >>= PAGE_SHIFT; nr_restored += len; i += len - 1; - pr_debug("Enqueue page-read\n"); + + nr_enqueued++; continue; } @@ -1205,7 +1260,8 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) pr_info("nr_restored_pages: %d\n", nr_restored); pr_info("nr_shared_pages: %d\n", nr_shared); - pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_enqueued: %d\n", nr_enqueued); pr_info("nr_lazy: %d\n", nr_lazy); return 0; @@ -1217,8 +1273,6 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) { - MmEntry *mm = rsti(t)->mm; - /* * There is no need to disable it if the page read doesn't * have parent. 
In this case VMA will be empty until @@ -1241,8 +1295,6 @@ static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) pr_perror("Cannot disable THP"); return -1; } - if (!(mm->has_thp_disabled && mm->thp_disabled)) - rsti(t)->has_thp_enabled = true; return 0; } diff --git a/criu/memfd.c b/criu/memfd.c index da29377034..9d9f0621fc 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -46,6 +46,7 @@ struct memfd_restore_inode { int fdstore_id; unsigned int pending_seals; MemfdInodeEntry *mie; + bool was_opened_rw; }; static LIST_HEAD(memfd_inodes); @@ -91,10 +92,21 @@ static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char * mie.has_hugetlb_flag = true; mie.hugetlb_flag = flag | MFD_HUGETLB; } + mie.mode = st->st_mode; + mie.has_mode = true; mie.seals = fcntl(fd, F_GET_SEALS); - if (mie.seals == -1) - goto out; + if (mie.seals == -1) { + if (errno != EINVAL || ~mie.hugetlb_flag & MFD_HUGETLB) { + pr_perror("fcntl(F_GET_SEALS)"); + goto out; + } + /* Kernels before 4.16 don't allow MFD_HUGETLB | + * MFD_ALLOW_SEALING and return EINVAL for + * fcntl(MFD_HUGETLB-enabled fd). 
+ */ + mie.seals = F_SEAL_SEAL; + } if (pb_write_one(img_from_set(glob_imgset, CR_FD_MEMFD_INODE), &mie, PB_MEMFD_INODE)) goto out; @@ -222,6 +234,7 @@ static int collect_one_memfd_inode(void *o, ProtobufCMessage *base, struct cr_im mutex_init(&inode->lock); inode->fdstore_id = -1; inode->pending_seals = 0; + inode->was_opened_rw = false; list_add_tail(&inode->list, &memfd_inodes); @@ -270,8 +283,13 @@ static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) goto out; - if (fchown(fd, mie->uid, mie->gid)) { - pr_perror("Can't change uid %d gid %d of memfd:%s", (int)mie->uid, (int)mie->gid, mie->name); + if (mie->has_mode) + ret = cr_fchperm(fd, mie->uid, mie->gid, mie->mode); + else + ret = cr_fchown(fd, mie->uid, mie->gid); + if (ret) { + pr_perror("Can't set permissions { uid %d gid %d mode %#o } of memfd:%s", (int)mie->uid, + (int)mie->gid, mie->has_mode ? (int)mie->mode : -1, mie->name); goto out; } @@ -305,7 +323,7 @@ static int memfd_open_inode(struct memfd_restore_inode *inode) return fd; } -int memfd_open(struct file_desc *d, u32 *fdflags) +int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap) { struct memfd_info *mfi; MemfdFileEntry *mfe; @@ -315,57 +333,80 @@ int memfd_open(struct file_desc *d, u32 *fdflags) mfi = container_of(d, struct memfd_info, d); mfe = mfi->mfe; - if (inherited_fd(d, &fd)) - return fd; - pr_info("Restoring memfd id=%d\n", mfe->id); fd = memfd_open_inode(mfi->inode); if (fd < 0) - goto err; + return -1; /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; + + if (filemap && (flags & O_ACCMODE) == O_RDWR) + return fd; + + if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { + /* + * If there is only a single RW-opened fd for a memfd, it can + * be used to pass it to execveat() with AT_EMPTY_PATH to have + * its contents executed. 
This currently works only for the + * original fd from memfd_create() so return the original fd + * once -- in case the caller expects to be the sole opener + * and does execveat() from this memfd. + */ + if (!fcntl(fd, F_SETFL, flags)) { + mfi->inode->was_opened_rw = true; + return fd; + } + + pr_pwarn("Can't change fd flags to %#o for memfd id=%d", flags, mfe->id); + } + /* * Ideally we should call compat version open() to not force the * O_LARGEFILE file flag with regular open(). It doesn't seem that * important though. */ _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); - if (_fd < 0) { + if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); - goto err; - } + else if (!filemap && (flags & O_ACCMODE) == O_RDWR) + pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); + close(fd); - fd = _fd; + return _fd; +} + +static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) +{ + MemfdFileEntry *mfe; + int fd; + + if (inherited_fd(d, new_fd)) + return 0; + + fd = memfd_open(d, NULL, false); + if (fd < 0) + return -1; + + mfe = container_of(d, struct memfd_info, d)->mfe; if (restore_fown(fd, mfe->fown) < 0) goto err; if (lseek(fd, mfe->pos, SEEK_SET) < 0) { - pr_perror("Can't restore file position of memfd id=%d", mfe->id); + pr_perror("Can't restore file position of %d for memfd id=%d", fd, mfe->id); goto err; } - return fd; + *new_fd = fd; + return 0; err: - if (fd >= 0) - close(fd); + close(fd); return -1; } -static int memfd_open_fe_fd(struct file_desc *fd, int *new_fd) -{ - int tmp; - - tmp = memfd_open(fd, NULL); - if (tmp < 0) - return -1; - *new_fd = tmp; - return 0; -} - static char *memfd_d_name(struct file_desc *d, char *buf, size_t s) { MemfdInodeEntry *mie = NULL; diff --git a/criu/mount.c b/criu/mount.c index db9db63b27..82bbd52d6c 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -98,7 +98,7 @@ static char *ext_mount_lookup(char *key) int len = strlen(key); char mkey[len + 6]; - sprintf(mkey, 
"mnt[%s]", key); + snprintf(mkey, sizeof(mkey), "mnt[%s]", key); v = external_lookup_by_key(mkey); if (IS_ERR(v)) v = NULL; @@ -1197,8 +1197,8 @@ int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinf dev == pm->s_dev_rt) return 0; - pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, - pm->fstype->name, pm->ns_mountpoint); + pr_warn("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, + pm->fstype->name, pm->ns_mountpoint); return -1; } @@ -1239,12 +1239,16 @@ int __open_mountpoint(struct mount_info *pm) int open_mount(unsigned int s_dev) { struct mount_info *m; + int mnt_fd; m = lookup_mnt_sdev(s_dev); if (!m) return -ENOENT; - return __open_mountpoint(m); + mnt_fd = __open_mountpoint(m); + if (mnt_fd < 0) + pr_err("Can't open mount %#x\n", s_dev); + return mnt_fd; } /* Bind-mount a mount point in a temporary place without children */ @@ -2823,7 +2827,7 @@ static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { - struct mount_info *mi; /* child is remaped into the root yards */ + struct mount_info *mi; /* child is remapped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; diff --git a/criu/namespaces.c b/criu/namespaces.c index b1b5303fa5..b7c0ab4008 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -1454,7 +1454,7 @@ int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) * each other easily. Stream socket require manual * messages boundaries. * - * b) Make callers note the damon death by seeing the + * b) Make callers note the daemon death by seeing the * disconnected socket. In case of dgram socket * callers would just get stuck in receiving the * response. 
diff --git a/criu/net.c b/criu/net.c index 2793b18e66..eee3311087 100644 --- a/criu/net.c +++ b/criu/net.c @@ -111,15 +111,18 @@ int read_ns_sys_file(char *path, char *buf, int len) } rlen = read(fd, buf, len); + if (rlen == -1) + pr_perror("Can't read ns' %s", path); close(fd); if (rlen == len) { + buf[0] = '\0'; pr_err("Too small buffer to read ns sys file %s\n", path); return -1; } - if (rlen > 0) - buf[rlen - 1] = '\0'; + if (rlen >= 0) + buf[rlen] = '\0'; return rlen; } @@ -356,22 +359,23 @@ static int ipv6_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntr return net_conf_op(tgt, conf, n, op, "ipv6", req, path, ARRAY_SIZE(devconfs6), devconfs6, def_conf); } -static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) +static int unix_conf_op(SysctlEntry ***rconf, size_t *pn, int op) { int i, ret = -1, flags = 0; char path[ARRAY_SIZE(unix_conf_entries)][MAX_CONF_UNIX_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(unix_conf_entries)] = {}; SysctlEntry **conf = *rconf; + size_t n = *pn; - if (*n != ARRAY_SIZE(unix_conf_entries)) { - pr_err("unix: Unexpected entries in config (%zu %zu)\n", *n, ARRAY_SIZE(unix_conf_entries)); + if (n != ARRAY_SIZE(unix_conf_entries)) { + pr_err("unix: Unexpected entries in config (%zu %zu)\n", n, ARRAY_SIZE(unix_conf_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < *n; i++) { + for (i = 0; i < n; i++) { snprintf(path[i], MAX_CONF_UNIX_PATH, CONF_UNIX_FMT, unix_conf_entries[i]); req[i].name = path[i]; req[i].flags = flags; @@ -387,7 +391,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) } } - ret = sysctl_op(req, *n, op, CLONE_NEWNET); + ret = sysctl_op(req, n, op, CLONE_NEWNET); if (ret < 0) { pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? 
"read" : "write", CONF_UNIX_BASE); return -1; @@ -396,7 +400,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) if (op == CTL_READ) { bool has_entries = false; - for (i = 0; i < *n; i++) { + for (i = 0; i < n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { conf[i]->has_iarg = true; if (!has_entries) @@ -409,7 +413,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) * Unix conf is optional. */ if (!has_entries) { - *n = 0; + *pn = 0; *rconf = NULL; } } @@ -2435,27 +2439,39 @@ static inline int do_restore_nftables(struct cr_img *img) off_t img_data_size; char *buf; - if ((img_data_size = img_raw_size(img)) < 0) + if ((img_data_size = img_raw_size(img)) < 0) { + pr_err("image size mismatch\n"); goto out; + } - if (read_img_str(img, &buf, img_data_size) < 0) + if (read_img_str(img, &buf, img_data_size) < 0) { + pr_err("Failed to read nftables data\n"); goto out; + } nft = nft_ctx_new(NFT_CTX_DEFAULT); - if (!nft) + if (!nft) { + pr_err("Failed to create nft context object\n"); goto buf_free_out; + } + + if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft)) { + pr_err("Failed to enable std/err output buffering\n"); + goto nft_ctx_free_out; + } - if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft) || #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) - nft_run_cmd_from_buffer(nft, buf, strlen(buf))) + if (nft_run_cmd_from_buffer(nft, buf, strlen(buf))) #elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) - nft_run_cmd_from_buffer(nft, buf)) + if (nft_run_cmd_from_buffer(nft, buf)) #else - { - BUILD_BUG_ON(1); - } + BUILD_BUG_ON(1); #endif + { + pr_err("nft command error:\n%s\n%s\n", + nft_ctx_get_error_buffer(nft), buf); goto nft_ctx_free_out; + } exit_code = 0; @@ -3131,6 +3147,9 @@ int network_lock_internal(void) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; @@ -3172,19 +3191,53 @@ static inline int 
nftables_network_unlock(void) #endif } +static bool iptables_has_criu_jump_target(void) +{ + int fd, ret; + char *argv[4] = { "sh", "-c", "iptables -C INPUT -j CRIU", NULL }; + + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + fd = -1; + pr_perror("failed to open /dev/null, using log fd"); + } + + ret = cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL); + close_safe(&fd); + return !ret; +} + static int iptables_network_unlock_internal(void) { - char conf[] = "*filter\n" - ":CRIU - [0:0]\n" - "-D INPUT -j CRIU\n" - "-D OUTPUT -j CRIU\n" - "-X CRIU\n" - "COMMIT\n"; + char delete_jump_targets[] = "*filter\n" + ":CRIU - [0:0]\n" + "-D INPUT -j CRIU\n" + "-D OUTPUT -j CRIU\n" + "COMMIT\n"; + + char delete_criu_chain[] = "*filter\n" + ":CRIU - [0:0]\n" + "-X CRIU\n" + "COMMIT\n"; + int ret = 0; - ret |= iptables_restore(false, conf, sizeof(conf) - 1); + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); if (kdat.ipv6) - ret |= iptables_restore(true, conf, sizeof(conf) - 1); + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + + /* For compatibility with iptables-nft backend, we need to make sure that all jump + * targets have been removed before deleting the CRIU chain. 
+ */ + if (iptables_has_criu_jump_target()) { + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + } + + ret |= iptables_restore(false, delete_criu_chain, sizeof(delete_criu_chain) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_criu_chain, sizeof(delete_criu_chain) - 1); return ret; } @@ -3193,6 +3246,9 @@ static int network_unlock_internal(void) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; @@ -3265,7 +3321,7 @@ int macvlan_ext_add(struct external *ext) /* * The setns() syscall (called by switch_ns()) can be extremely * slow. If we call it two or more times from the same task the - * kernel will synchonously go on a very slow routine called + * kernel will synchronously go on a very slow routine called * synchronize_rcu() trying to put a reference on old namespaces. 
* * To avoid doing this more than once we pre-create all the @@ -3433,7 +3489,7 @@ struct ns_id *net_get_root_ns(void) /* * socket_diag doesn't report unbound and unconnected sockets, - * so we have to get their network namesapces explicitly + * so we have to get their network namespaces explicitly */ struct ns_id *get_socket_ns(int lfd) { diff --git a/criu/netfilter.c b/criu/netfilter.c index 2212fd9f23..9e78dc4b03 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -48,8 +48,8 @@ void preload_netfilter_modules(void) fd = -1; pr_perror("failed to open /dev/null, using log fd for net module preload"); } - cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, 0); - cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, 0); + cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, CRS_CAN_FAIL); + cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, CRS_CAN_FAIL); close_safe(&fd); } diff --git a/criu/page-pipe.c b/criu/page-pipe.c index 54dc3ccc41..aab6742be7 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -99,6 +99,7 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl { struct page_pipe_buf *prev = pp_prev_ppb(pp, ppb_flags); struct page_pipe_buf *ppb; + int ppb_size = 0; ppb = xmalloc(sizeof(*ppb)); if (!ppb) @@ -120,7 +121,13 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl cnt_add(CNT_PAGE_PIPES, 1); ppb->pipe_off = 0; - ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE; + ppb_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0); + if (ppb_size < 0) { + xfree(ppb); + pr_perror("Can't get pipe size"); + return NULL; + } + ppb->pipe_size = ppb_size / PAGE_SIZE; pp->nr_pipes++; } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 782d4cafce..94f4774148 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -2,6 +2,7 @@ #include #include 
#include +#include #include #include #include @@ -157,6 +158,20 @@ static inline int send_psi(int sk, struct page_server_iov *pi) return send_psi_flags(sk, pi, 0); } +static void tcp_cork(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_CORK=%d", val); +} + +static void tcp_nodelay(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_NODELAY=%d", val); +} + /* page-server xfer */ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long len) { diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 00f088ff3f..f04a517de3 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -1,5 +1,6 @@ #include #include +#include #include "page.h" #include "pagemap-cache.h" @@ -10,6 +11,7 @@ #include "vma.h" #include "mem.h" #include "kerndat.h" +#include "fault-injection.h" #undef LOG_PREFIX #define LOG_PREFIX "pagemap-cache: " @@ -22,6 +24,8 @@ #define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64)) +#define PAGE_REGIONS_MAX_NR 32768 + /* * It's a workaround for a kernel bug. In the 3.19 kernel when pagemap are read * for a few vma-s for one read call, it returns incorrect data. 
@@ -50,10 +54,23 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz pmc->pid = pid; pmc->map_len = PAGEMAP_LEN(map_size); pmc->vma_head = vma_head; - - pmc->map = xmalloc(pmc->map_len); - if (!pmc->map) - goto err; + pmc->regs_max_len = PAGE_PFN(map_size); + if (pmc->regs_max_len > PAGE_REGIONS_MAX_NR) + pmc->regs_max_len = PAGE_REGIONS_MAX_NR; + pmc->regs_len = 0; + pmc->regs_idx = 0; + pmc->regs = NULL; + pmc->map = NULL; + + if (kdat.has_pagemap_scan && !fault_injected(FI_DONT_USE_PAGEMAP_SCAN)) { + pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region)); + if (!pmc->regs) + goto err; + } else { + pmc->map = xmalloc(pmc->map_len); + if (!pmc->map) + goto err; + } if (pagemap_cache_disabled) pr_warn_once("The pagemap cache is disabled\n"); @@ -87,17 +104,11 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz return -1; } -static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr) -{ - return &pmc->map[PAGE_PFN(addr - pmc->start)]; -} - static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) { unsigned long low = vma->e->start & PMC_MASK; unsigned long high = low + PMC_SIZE; size_t len = vma_area_len(vma); - size_t size_map; if (high > kdat.task_size) high = kdat.task_size; @@ -115,7 +126,7 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) * fit in solid manner, iow -- either the whole vma fits * the cache window, either plain read is used. * - * The benefit (apart redusing the number of read() calls) + * The benefit (apart reducing the number of read() calls) * is to walk page tables less. 
*/ if (!pagemap_cache_disabled && len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) { @@ -149,39 +160,86 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) pr_debug("\t%d: simple mode [l:%lx h:%lx]\n", pmc->pid, pmc->start, pmc->end); } + return pmc_fill(pmc, pmc->start, pmc->end); +} + +int pmc_fill(pmc_t *pmc, u64 start, u64 end) +{ + size_t size_map, off; + + pmc->start = start; + pmc->end = end; + size_map = PAGEMAP_LEN(pmc->end - pmc->start); BUG_ON(pmc->map_len < size_map); BUG_ON(pmc->fd < 0); - if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { - pmc_zap(pmc); - pr_perror("Can't read %d's pagemap file", pmc->pid); - return -1; + if (pmc->regs) { + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = pmc->start, + .end = pmc->end, + .vec = (long)pmc->regs, + .vec_len = pmc->regs_max_len, + .max_pages = 0, + /* + * Request pages that are in RAM or swap, excluding + * zero-filled and file-backed pages. 
+ */ + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; + long ret; + + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); + if (ret == -1) { + pr_perror("PAGEMAP_SCAN"); + pmc_zap(pmc); + return -1; + } + pmc->regs_len = ret; + pmc->regs_idx = 0; + pmc->end = args.walk_end; + } else { + for (off = 0; off != size_map;) { + ssize_t ret; + char *ptr = (char *)pmc->map; + + ret = pread(pmc->fd, ptr + off, size_map - off, PAGEMAP_PFN_OFF(pmc->start) + off); + if (ret == -1) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; + } + off += ret; + } } return 0; } -u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma) +int pmc_get_map(pmc_t *pmc, const struct vma_area *vma) { /* Hit */ if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end)) - return __pmc_get_map(pmc, vma->e->start); + return 0; /* Miss, refill the cache */ if (pmc_fill_cache(pmc, vma)) { pr_err("Failed to fill cache for %d (%lx-%lx)\n", pmc->pid, (long)vma->e->start, (long)vma->e->end); - return NULL; + return -1; } - - /* Hit for sure */ - return __pmc_get_map(pmc, vma->e->start); + return 0; } void pmc_fini(pmc_t *pmc) { close_safe(&pmc->fd); xfree(pmc->map); + xfree(pmc->regs); pmc_reset(pmc); } diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 35489634d9..a88f8a66f2 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -9,7 +9,6 @@ #include "common/compiler.h" #include "types.h" #include "protobuf.h" -#include "images/sa.pb-c.h" #include "images/timer.pb-c.h" #include "images/creds.pb-c.h" #include "images/core.pb-c.h" @@ -115,6 +114,10 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, 
sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); + if (c->no_new_privs > 0) { + ce->no_new_privs = c->no_new_privs; + ce->has_no_new_privs = true; + } ce->secbits = c->secbits; ce->n_groups = c->ngroups; @@ -224,211 +227,12 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit return dump_thread_core(pid, core, args); } -int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - TaskCoreEntry *tc = item->core[0]->tc; - struct parasite_dump_sa_args *args; - int ret, sig; - SaEntry *sa, **psa; - - args = compel_parasite_args(ctl, struct parasite_dump_sa_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); - if (ret < 0) - return ret; - - psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); - if (!psa) - return -1; - - sa = (SaEntry *)(psa + SIGMAX - 2); - - tc->n_sigactions = SIGMAX - 2; - tc->sigactions = psa; - - for (sig = 1; sig <= SIGMAX; sig++) { - int i = sig - 1; - - if (sig == SIGSTOP || sig == SIGKILL) - continue; - - sa_entry__init(sa); - ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); - ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); - ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); -#ifdef CONFIG_MIPS - sa->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); - memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); -#else - BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); -#endif - sa->has_compat_sigaction = true; - sa->compat_sigaction = !compel_mode_native(ctl); - - *(psa++) = sa++; - } - - return 0; -} - -static void encode_itimer(struct itimerval *v, ItimerEntry *ie) -{ - ie->isec = v->it_interval.tv_sec; - ie->iusec = v->it_interval.tv_usec; - ie->vsec = v->it_value.tv_sec; - 
ie->vusec = v->it_value.tv_usec; -} - -int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - struct parasite_dump_itimers_args *args; - int ret; - - args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); - if (ret < 0) - return ret; - - encode_itimer((&args->real), (core->tc->timers->real)); - encode_itimer((&args->virt), (core->tc->timers->virt)); - encode_itimer((&args->prof), (core->tc->timers->prof)); - - return 0; -} - -static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) -{ - int sz; - - /* - * Will be free()-ed in core_entry_free() - */ - - sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); - tte->posix = xmalloc(sz); - if (!tte->posix) - return -1; - - tte->n_posix = n; - *pte = (PosixTimerEntry *)(tte->posix + n); - return 0; -} - -static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) -{ - pid_t vtid = 0; - int i; - - if (rtid == 0) - return 0; - - if (!(root_ns_mask & CLONE_NEWPID)) { - /* Non-pid-namespace case */ - pte->notify_thread_id = rtid; - pte->has_notify_thread_id = true; - return 0; - } - - /* Pid-namespace case */ - if (!kdat.has_nspid) { - pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); - return -1; - } - - for (i = 0; i < item->nr_threads; i++) { - if (item->threads[i].real != rtid) - continue; - - vtid = item->threads[i].ns[0].virt; - break; - } - - if (vtid == 0) { - pr_err("Unable to convert the notify thread id %d\n", rtid); - return -1; - } - - pte->notify_thread_id = vtid; - pte->has_notify_thread_id = true; - return 0; -} - -static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, - PosixTimerEntry *pte) -{ - pte->it_id = vp->spt.it_id; - pte->clock_id = vp->spt.clock_id; - pte->si_signo = vp->spt.si_signo; - 
pte->it_sigev_notify = vp->spt.it_sigev_notify; - pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); - - pte->overrun = v->overrun; - - pte->isec = v->val.it_interval.tv_sec; - pte->insec = v->val.it_interval.tv_nsec; - pte->vsec = v->val.it_value.tv_sec; - pte->vnsec = v->val.it_value.tv_nsec; - - if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) - return -1; - - return 0; -} - -int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - TaskTimersEntry *tte = core->tc->timers; - PosixTimerEntry *pte; - struct proc_posix_timer *temp; - struct parasite_dump_posix_timers_args *args; - int ret, exit_code = -1; - int args_size; - int i; - - if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) - return -1; - - args_size = posix_timers_dump_size(proc_args->timer_n); - args = compel_parasite_args_s(ctl, args_size); - args->timer_n = proc_args->timer_n; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - args->timer[i].it_id = temp->spt.it_id; - i++; - } - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); - if (ret < 0) - goto end_posix; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - posix_timer_entry__init(&pte[i]); - if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) - goto end_posix; - tte->posix[i] = &pte[i]; - i++; - } - - exit_code = 0; -end_posix: - free_posix_timers(proc_args); - return exit_code; -} - int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc) { struct parasite_dump_misc *ma; ma = compel_parasite_args(ctl, struct parasite_dump_misc); + ma->has_membarrier_get_registrations = kdat.has_membarrier_get_registrations; if (compel_rpc_call_sync(PARASITE_CMD_DUMP_MISC, ctl) < 0) return -1; diff --git a/criu/pidfd.c b/criu/pidfd.c new file mode 100644 index 0000000000..3ea3c93094 --- /dev/null +++ b/criu/pidfd.c @@ -0,0 +1,307 
@@ +#include "common/lock.h" +#include "imgset.h" +#include "pidfd.h" +#include "fdinfo.h" +#include "pidfd.pb-c.h" +#include "protobuf.h" +#include "pstree.h" +#include +#include +#include +#include "common/bug.h" +#include "rst-malloc.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "pidfd: " + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +struct pidfd_info { + PidfdEntry *pidfe; + struct file_desc d; +}; + +struct dead_pidfd { + unsigned int ino; + int pid; + size_t count; + mutex_t pidfd_lock; + struct hlist_node hash; +}; + +#define DEAD_PIDFD_HASH_SIZE 32 +static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; +static mutex_t *dead_pidfd_hash_lock; + +int init_dead_pidfd_hash(void) +{ + for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) + INIT_HLIST_HEAD(&dead_pidfd_hash[i]); + + dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock)); + if (!dead_pidfd_hash_lock) + return -1; + + mutex_init(dead_pidfd_hash_lock); + + return 0; +} + +static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) +{ + struct dead_pidfd *dead; + struct hlist_head *chain; + + mutex_lock(dead_pidfd_hash_lock); + chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; + hlist_for_each_entry(dead, chain, hash) { + if (dead->ino == ino) { + mutex_unlock(dead_pidfd_hash_lock); + return dead; + } + } + mutex_unlock(dead_pidfd_hash_lock); + + return NULL; +} + +int is_pidfd_link(char *link) +{ + /* + * pidfs was introduced in Linux 6.9 + * before which anonymous-inodes were used + */ + return is_anon_link_type(link, "[pidfd]"); +} + +static void pr_info_pidfd(char *action, PidfdEntry *pidfe) +{ + pr_info("%s: id %#08x flags %u NSpid %d ino %u\n", + action, pidfe->id, pidfe->flags, pidfe->nspid, pidfe->ino + ); +} + +static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p) +{ + struct pidfd_dump_info pidfd_info = {.pidfe = PIDFD_ENTRY__INIT}; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info)) + return -1; + + 
if (p->flags & PIDFD_THREAD) { + pr_err("PIDFD_THREAD flag is currently not supported\n"); + return -1; + } + + /* + * Check if the pid pidfd refers to is part of process tree + * This ensures the process will exist on restore. + */ + if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) { + pr_err("pidfd pid %d is not a part of process tree..\n", + pidfd_info.pid); + return -1; + } + + pidfd_info.pidfe.id = id; + pidfd_info.pidfe.flags = (p->flags & ~O_RDWR); + pidfd_info.pidfe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__PIDFD; + fe.id = pidfd_info.pidfe.id; + fe.pidfd = &pidfd_info.pidfe; + + pr_info_pidfd("Dumping", &pidfd_info.pidfe); + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops pidfd_dump_ops = { + .type = FD_TYPES__PIDFD, + .dump = dump_one_pidfd, +}; + +static int pidfd_open(pid_t pid, int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int create_tmp_process(void) +{ + int tmp_process; + tmp_process = fork(); + if (tmp_process < 0) { + pr_perror("Could not fork"); + return -1; + } else if (tmp_process == 0) { + while(1) + sleep(1); + } + return tmp_process; +} + +static int free_dead_pidfd(struct dead_pidfd *dead) +{ + int status; + sigset_t blockmask, oldmask; + + /* + * Block SIGCHLD to prevent interfering from sigchld_handler() + * and to properly handle the tmp process termination without + * a race condition. A similar approach is used in cr_system(). 
+ */ + sigemptyset(&oldmask); + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { + pr_perror("Cannot set mask of blocked signals"); + goto err; + } + + if (kill(dead->pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", + dead->pid); + goto err; + } + + if (waitpid(dead->pid, &status, 0) != dead->pid) { + pr_perror("Could not wait on temporary process with pid: %d", + dead->pid); + goto err; + } + + /* Restore the original signal mask after tmp process has terminated */ + if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { + pr_perror("Cannot clear blocked signals"); + goto err; + } + + if (!WIFSIGNALED(status)) { + pr_err("Expected temporary process to be terminated by a signal\n"); + goto err; + } + + if (WTERMSIG(status) != SIGKILL) { + pr_err("Expected temporary process to be terminated by SIGKILL\n"); + goto err; + } + + mutex_lock(dead_pidfd_hash_lock); + hlist_del(&dead->hash); + mutex_unlock(dead_pidfd_hash_lock); + return 0; +err: + return -1; +} + +static int open_one_pidfd(struct file_desc *d, int *new_fd) +{ + struct pidfd_info *info; + struct dead_pidfd *dead = NULL; + int pidfd; + + info = container_of(d, struct pidfd_info, d); + if (info->pidfe->nspid != -1) { + pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + goto err_close; + } + goto out; + } + + dead = lookup_dead_pidfd(info->pidfe->ino); + BUG_ON(!dead); + + mutex_lock(&dead->pidfd_lock); + BUG_ON(dead->count == 0); + dead->count--; + if (dead->pid == -1) { + dead->pid = create_tmp_process(); + if (dead->pid < 0) { + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + } + + pidfd = pidfd_open(dead->pid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + + if (dead->count == 0) { + if 
(free_dead_pidfd(dead)) { + pr_err("Failed to delete dead_pidfd struct\n"); + mutex_unlock(&dead->pidfd_lock); + close(pidfd); + goto err_close; + } + } + mutex_unlock(&dead->pidfd_lock); + +out: + if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { + goto err_close; + } + + *new_fd = pidfd; + return 0; +err_close: + pr_err("Can't create pidfd %#08x NSpid: %d flags: %u\n", + info->pidfe->id, info->pidfe->nspid, info->pidfe->flags); + return -1; +} + +static struct file_desc_ops pidfd_desc_ops = { + .type = FD_TYPES__PIDFD, + .open = open_one_pidfd +}; + +static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct dead_pidfd *dead; + struct pidfd_info *info = obj; + + info->pidfe = pb_msg(msg, PidfdEntry); + pr_info_pidfd("Collected ", info->pidfe); + + if (info->pidfe->nspid != -1) + goto out; + + dead = lookup_dead_pidfd(info->pidfe->ino); + if (dead) { + mutex_lock(&dead->pidfd_lock); + dead->count++; + mutex_unlock(&dead->pidfd_lock); + goto out; + } + + dead = shmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate shared memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->ino = info->pidfe->ino; + dead->count = 1; + dead->pid = -1; + mutex_init(&dead->pidfd_lock); + + mutex_lock(dead_pidfd_hash_lock); + hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); + mutex_unlock(dead_pidfd_hash_lock); +out: + return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); +} + +struct collect_image_info pidfd_cinfo = { + .fd_type = CR_FD_PIDFD, + .pb_type = PB_PIDFD, + .priv_size = sizeof(struct pidfd_info), + .collect = collect_one_pidfd, +}; diff --git a/criu/pie/Makefile b/criu/pie/Makefile index 265dcf82bd..912fab24ba 100644 --- a/criu/pie/Makefile +++ b/criu/pie/Makefile @@ -18,6 +18,11 @@ ifeq ($(ARCH),mips) ccflags-y += -mno-abicalls -fno-pic endif +# -mshstk required for CET instructions +ifeq ($(ARCH),x86) + ccflags-y += -mshstk +endif + LDS := 
compel/arch/$(ARCH)/scripts/compel-pack.lds.S restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 2303f41c39..e151ed6563 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -211,6 +211,63 @@ static int dump_thread_common(struct parasite_dump_thread *ti) return ret; } +/* + * Returns a membarrier() registration command (it is a bitmask) if the process + * was registered for specified (as a bit index) membarrier()-issuing command; + * returns zero otherwise. + */ +static int get_membarrier_registration_mask(int cmd_bit) +{ + unsigned cmd = 1 << cmd_bit; + int ret; + + /* + * Issuing a barrier will be successful only if the process was registered + * for this type of membarrier. All errors are a sign that the type issued + * was not registered (EPERM) or not supported by kernel (EINVAL or ENOSYS). + */ + ret = sys_membarrier(cmd, 0, 0); + if (ret && ret != -EPERM && ret != -EINVAL && ret != -ENOSYS) { + pr_err("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + return -1; + } + pr_debug("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + /* + * For supported registrations, MEMBARRIER_CMD_REGISTER_xxx = MEMBARRIER_CMD_xxx << 1. + * See: enum membarrier_cmd in include/uapi/linux/membarrier.h in kernel sources. + */ + return ret ? 0 : cmd << 1; +} + +/* + * It would be better to check the following with BUILD_BUG_ON, but we might + * have an old linux/membarrier.h header without necessary enum values. 
+ */ +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED 3 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE 5 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ 7 +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int dump_membarrier_compat(int *membarrier_registration_mask) +{ + int ret; + + *membarrier_registration_mask = 0; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + return 0; +} + static int dump_misc(struct parasite_dump_misc *args) { int ret; @@ -225,6 +282,19 @@ static int dump_misc(struct parasite_dump_misc *args) args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); args->thp_disabled = sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (args->has_membarrier_get_registrations) { + ret = sys_membarrier(1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + pr_err("membarrier(1 << %d) returned %d\n", MEMBARRIER_CMDBIT_GET_REGISTRATIONS, ret); + return -1; + } + args->membarrier_registration_mask = ret; + } else { + ret = dump_membarrier_compat(&args->membarrier_registration_mask); + if (ret) + return ret; + } + ret = sys_prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&args->child_subreaper, 0, 0, 0); if (ret) pr_err("PR_GET_CHILD_SUBREAPER failed (%d)\n", ret); @@ -268,6 +338,7 @@ static int dump_creds(struct parasite_dump_creds *args) } } + args->no_new_privs = sys_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); ret = sys_getgroups(0, NULL); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 5e78e74d4f..51ed6ed4c8 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ 
-49,7 +49,17 @@ #include "images/inventory.pb-c.h" #include "shmem.h" -#include "restorer.h" + +/* + * sys_getgroups() buffer size. Not too much, to avoid stack overflow. + */ +#define MAX_GETGROUPS_CHECKED (512 / sizeof(unsigned int)) + +/* + * Memory overhead limit for reading VMA when auto_dedup is enabled. + * An arbitrarily chosen trade-off point between speed and memory usage. + */ +#define AUTO_DEDUP_OVERHEAD_BYTES (128 << 20) #ifndef PR_SET_PDEATHSIG #define PR_SET_PDEATHSIG 1 @@ -67,6 +77,10 @@ #define FALLOC_FL_PUNCH_HOLE 0x02 #endif +#ifndef ARCH_RT_SIGRETURN_RST +#define ARCH_RT_SIGRETURN_RST ARCH_RT_SIGRETURN +#endif + #define sys_prctl_safe(opcode, val1, val2, val3) \ ({ \ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ @@ -93,7 +107,7 @@ bool fault_injected(enum faults f) * Hint: compel on aarch64 shall learn relocs for that. */ static unsigned __page_size; -unsigned page_size(void) +unsigned long page_size(void) { return __page_size; } @@ -191,22 +205,41 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ int b, i, ret; struct cap_header hdr; struct cap_data data[_LINUX_CAPABILITY_U32S_3]; - - /* - * We're still root here and thus can do it without failures. - */ + int ruid, euid, suid, fsuid; + int rgid, egid, sgid, fsgid; /* * Setup supplementary group IDs early. */ if (args->groups) { - ret = sys_setgroups(ce->n_groups, args->groups); - if (ret) { - pr_err("Can't setup supplementary group IDs: %d\n", ret); - return -1; + /* + * We may be in an unprivileged user namespace where setgroups + * is disabled. If the current list of groups is already what + * we want, skip the call to setgroups. 
+ */ + unsigned int gids[MAX_GETGROUPS_CHECKED]; + int n = sys_getgroups(MAX_GETGROUPS_CHECKED, gids); + if (n != ce->n_groups || memcmp(gids, args->groups, n * sizeof(*gids))) { + ret = sys_setgroups(ce->n_groups, args->groups); + if (ret) { + pr_err("Can't setgroups([%zu gids]): %d\n", ce->n_groups, ret); + return -1; + } } } + /* + * Compare xids with current values. If all match then we can skip + * setting them (which requires extra capabilities). + */ + fsuid = sys_setfsuid(-1); + fsgid = sys_setfsgid(-1); + if (sys_getresuid(&ruid, &euid, &suid) == 0 && sys_getresgid(&rgid, &egid, &sgid) == 0 && ruid == ce->uid && + euid == ce->euid && suid == ce->suid && rgid == ce->gid && egid == ce->egid && sgid == ce->sgid && + fsuid == ce->fsuid && fsgid == ce->fsgid) { + goto skip_xids; + } + /* * First -- set the SECURE_NO_SETUID_FIXUP bit not to * lose caps bits when changing xids. @@ -250,12 +283,13 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ return -1; } +skip_xids: /* * Third -- restore securebits. We don't need them in any * special state any longer. */ - if (!uid) { + if (sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0) != ce->secbits) { ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); if (ret) { pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); @@ -276,10 +310,18 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ /* already set */ continue; ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0); - if (ret) { + if (!ret) + continue; + if (!ce->has_no_new_privs || !ce->no_new_privs || args->cap_prm[b] & (1 << i)) { pr_err("Unable to drop capability %d: %d\n", i + b * 32, ret); return -1; } + /* + * If prctl(NO_NEW_PRIVS) is going to be set then it + * will prevent inheriting the capabilities not in + * the permitted set. 
+ */ + pr_warn("Unable to drop capability %d from bset: %d (but NO_NEW_PRIVS will drop it)\n", i + b * 32, ret); } } @@ -320,6 +362,14 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ if (lsm_set_label(args->lsm_sockcreate, "sockcreate", procfd) < 0) return -1; + if (ce->has_no_new_privs && ce->no_new_privs) { + ret = sys_prctl(PR_SET_NO_NEW_PRIVS, ce->no_new_privs, 0, 0, 0); + if (ret) { + pr_err("Unable to set no_new_privs=%d: %d\n", ce->no_new_privs, ret); + return -1; + } + } + return 0; } @@ -584,7 +634,7 @@ static int restore_thread_common(struct thread_restore_args *args) static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sigframe) { - ARCH_RT_SIGRETURN(new_sp, sigframe); + ARCH_RT_SIGRETURN_RST(new_sp, sigframe); } static int send_cg_set(int sk, int cg_set) @@ -688,7 +738,7 @@ static int recv_cg_set_restore_ack(int sk) * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. */ -long __export_restore_thread(struct thread_restore_args *args) +__visible long __export_restore_thread(struct thread_restore_args *args) { struct rt_sigframe *rt_sigframe; k_rtsigset_t to_block; @@ -701,6 +751,10 @@ long __export_restore_thread(struct thread_restore_args *args) goto core_restore_end; } + /* restore original shadow stack */ + if (arch_shstk_restore(&args->shstk)) + goto core_restore_end; + /* All signals must be handled by thread leader */ ksigfillset(&to_block); ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t)); @@ -1068,7 +1122,7 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) * |G|----tgt----| | * * 3. remap src to any other place. 
- * G prevents src from being remaped on tgt again + * G prevents src from being remapped on tgt again * | |-------------| -> |+++++src+++++| * |G|---tgt-----| | * @@ -1169,7 +1223,7 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { int ret, i; - kernel_timer_t next_id; + kernel_timer_t next_id = 0, timer_id; struct sigevent sev; for (i = 0; i < args->posix_timers_n; i++) { @@ -1183,25 +1237,26 @@ static int create_posix_timers(struct task_restore_args *args) sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; while (1) { - ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); return ret; } - if (next_id == args->posix_timers[i].spt.it_id) + if (timer_id != next_id) { + pr_err("Can't create timers, kernel don't give them consequently\n"); + return -1; + } + next_id++; + + if (timer_id == args->posix_timers[i].spt.it_id) break; - ret = sys_timer_delete(next_id); + ret = sys_timer_delete(timer_id); if (ret < 0) { - pr_err("Can't remove temporaty posix timer 0x%x\n", next_id); + pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); return ret; } - - if ((long)next_id > args->posix_timers[i].spt.it_id) { - pr_err("Can't create timers, kernel don't give them consequently\n"); - return -1; - } } } @@ -1228,7 +1283,7 @@ unsigned long vdso_rt_size = 0; void *bootstrap_start = NULL; unsigned int bootstrap_len = 0; -void __export_unmap(void) +__visible void __export_unmap(void) { sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size); } @@ -1435,6 +1490,40 @@ static int fd_poll(int inotify_fd) return sys_ppoll(&pfd, 1, &tmo, NULL, sizeof(sigset_t)); } +/* + * Call preadv() but limit size of the read. Zero `max_to_read` skips the limit. 
+ */ +static ssize_t preadv_limited(int fd, struct iovec *iovs, int nr, off_t offs, size_t max_to_read) +{ + size_t saved_last_iov_len = 0; + ssize_t ret; + + if (max_to_read) { + for (int i = 0; i < nr; ++i) { + if (iovs[i].iov_len <= max_to_read) { + max_to_read -= iovs[i].iov_len; + continue; + } + + if (!max_to_read) { + nr = i; + break; + } + + saved_last_iov_len = iovs[i].iov_len; + iovs[i].iov_len = max_to_read; + nr = i + 1; + break; + } + } + + ret = sys_preadv(fd, iovs, nr, offs); + if (saved_last_iov_len) + iovs[nr - 1].iov_len = saved_last_iov_len; + + return ret; +} + /* * In the worst case buf size should be: * sizeof(struct inotify_event) * 2 + PATH_MAX @@ -1495,6 +1584,30 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) return 0; } +/* + * Restore membarrier() registrations. + */ +static int restore_membarrier_registrations(int mask) +{ + unsigned long bitmap[1] = { mask }; + int i, err, ret = 0; + + if (!mask) + return 0; + + pr_info("Restoring membarrier() registrations %x\n", mask); + + for_each_bit(i, bitmap) { + err = sys_membarrier(1 << i, 0, 0); + if (!err) + continue; + pr_err("Can't restore membarrier(1 << %d) registration: %d\n", i, err); + ret = -1; + } + + return ret; +} + /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -1502,7 +1615,7 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) * and jump execution to some predefined ip read from * core file. */ -long __export_restore_task(struct task_restore_args *args) +__visible long __export_restore_task(struct task_restore_args *args) { long ret = -1; int i; @@ -1562,6 +1675,9 @@ long __export_restore_task(struct task_restore_args *args) pr_debug("lazy-pages: uffd %d\n", args->uffd); } + if (arch_shstk_switch_to_restorer(&args->shstk)) + goto core_restore_end; + /* * Park vdso/vvar in a safe place if architecture doesn't support * mapping them with arch_prctl(). 
@@ -1613,6 +1729,13 @@ long __export_restore_task(struct task_restore_args *args) if (vma_entry->start > vma_entry->shmid) break; + /* + * shadow stack VMAs cannot be remapped, they must be + * recreated with map_shadow_stack system call + */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) + continue; + if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -1630,21 +1753,24 @@ long __export_restore_task(struct task_restore_args *args) if (vma_entry->start < vma_entry->shmid) break; + /* + * shadow stack VMAs cannot be remapped, they must be + * recreated with map_shadow_stack system call + */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) + continue; + if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } - if (args->uffd > -1) { - /* re-enable THP if we disabled it previously */ - if (args->has_thp_enabled) { - int ret; - ret = sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0); - if (ret) { - pr_err("Cannot re-enable THP: %d\n", ret); - goto core_restore_end; - } - } + ret = sys_prctl(PR_SET_THP_DISABLE, args->thp_disabled, 0, 0, 0); + if (ret) { + pr_err("Cannot restore THP_DISABLE=%d flag: %ld\n", args->thp_disabled, ret); + goto core_restore_end; + } + if (args->uffd > -1) { pr_debug("lazy-pages: closing uffd %d\n", args->uffd); /* * All userfaultfd configuration has finished at this point. @@ -1686,7 +1812,12 @@ long __export_restore_task(struct task_restore_args *args) while (nr) { pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); - r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off); + /* + * If we're requested to punch holes in the file after reading we do + * it to save memory. Limit the reads then to an arbitrary block size. + */ + r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, + args->auto_dedup ? 
AUTO_DEDUP_OVERHEAD_BYTES : 0); if (r < 0) { pr_err("Can't read pages data (%d)\n", (int)r); goto core_restore_end; @@ -1985,6 +2116,9 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } + if (restore_membarrier_registrations(args->membarrier_registration_mask) < 0) + goto core_restore_end; + pr_info("%ld: Restored\n", sys_getpid()); restore_finish_stage(task_entries_local, CR_STATE_RESTORE); @@ -2052,6 +2186,14 @@ long __export_restore_task(struct task_restore_args *args) futex_set_and_wake(&thread_inprogress, args->nr_threads); + /* + * Shadow stack of the leader can be locked only after all other + * threads were cloned, otherwise they may start with read-only + * shadow stack. + */ + if (arch_shstk_restore(&args->shstk)) + goto core_restore_end; + restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); if (ret) diff --git a/criu/pipes.c b/criu/pipes.c index 43ff06e3d8..daada88306 100644 --- a/criu/pipes.c +++ b/criu/pipes.c @@ -434,7 +434,7 @@ int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms /* steal_pipe has to be able to fit all data from a target pipe */ if (fcntl(steal_pipe[1], F_SETPIPE_SZ, pipe_size) < 0) { pr_perror("Unable to set a pipe size"); - goto err; + goto err_close; } bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK); diff --git a/criu/plugin.c b/criu/plugin.c index f3fea28566..65e79a0692 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -57,6 +57,8 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma"); __assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map"); __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); + __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); + __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); #undef __assign_hook @@ -254,6 +256,9 @@ int cr_plugin_init(int stage) goto err; } + if (stage == CR_PLUGIN_STAGE__RESTORE && 
check_inventory_plugins()) + goto err; + exit_code = 0; err: closedir(d); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 5e96b5c963..95ebe3a411 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -42,10 +42,12 @@ #include "fault-injection.h" #include "memfd.h" #include "hugetlb.h" +#include "pidfd.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" +#include "pidfd.pb-c.h" #include "plugin.h" #include @@ -118,7 +120,8 @@ bool handle_vma_plugin(int *fd, struct stat *stat) return true; } -static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) +static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, + int *shstk) { char *tok; @@ -162,6 +165,9 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) *io_pf = 1; + if (_vmflag_match(tok, "ss")) + *shstk = 1; + /* * Anything else is just ignored. */ @@ -172,14 +178,21 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) { - __parse_vmflags(buf, flags, madv, io_pf); + int shstk = 0; + + __parse_vmflags(buf, flags, madv, io_pf, &shstk); } static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) { int io_pf = 0; + int shstk = 0; - __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf); + __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf, + &shstk); + + if (shstk) + vma_area->e->status |= VMA_AREA_SHSTK; /* * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the @@ -338,7 +351,7 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct fd = open(fname, O_RDONLY); if (fd < 0) { pr_perror("Can't open mapped [%s]", fname); - goto returnerr; + return -1; } if (vma_stat(vma, fd)) { @@ -379,7 +392,6 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct pr_err("Failed to resolve 
mapping %lx filename\n", (unsigned long)vma->e->start); closefd: close(fd); -returnerr: return -1; } @@ -842,6 +854,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du goto err; } + pr_debug("Handling VMA with the following smaps entry: %s\n", str); if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd)) goto err; @@ -1407,7 +1420,7 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) goto err; new->mountpoint[0] = '.'; - ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, + ret = sscanf(str, "%i %i %u:%u %ms %4094s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; @@ -1972,10 +1985,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) " pos:%lli ino:%lx sdev:%x", &e->tfd, &e->events, (long long *)&e->data, (long long *)&e->pos, (long *)&e->inode, &e->dev); - if (ret < 3 || ret > 6) { - eventpoll_tfd_entry__free_unpacked(e, NULL); - goto parse_err; - } else if (ret == 3) { + if (ret == 3) { e->has_dev = false; e->has_inode = false; e->has_pos = false; @@ -1983,7 +1993,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) e->has_dev = true; e->has_inode = true; e->has_pos = true; - } else if (ret < 6) { + } else { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; } @@ -2157,6 +2167,33 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) if (ret) goto parse_err; + entry_met = true; + continue; + } + if (fdinfo_field(str, "ino") || fdinfo_field(str, "NSpid") || fdinfo_field(str, "Pid")) { + struct pidfd_dump_info *pidfd_info = arg; + + if (type != FD_TYPES__PIDFD) + continue; + + if (fdinfo_field(str, "ino")) { + ret = sscanf(str, "%*s %u", &pidfd_info->pidfe.ino); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "Pid")) { + ret = sscanf(str, "%*s %d", 
&pidfd_info->pid); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "NSpid")) { + char *last; + + last = strrchr(str, '\t'); + if (!last || sscanf(last, "%d", &pidfd_info->pidfe.nspid) != 1) { + pr_err("Unable to parse: %s\n", str); + goto parse_err; + } + } + entry_met = true; continue; } @@ -2208,10 +2245,10 @@ static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { - num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { - num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5be..e0dbfccc21 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "images/pidfd.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; diff --git a/criu/seize.c b/criu/seize.c index 91090ae1a7..edeb57cc8a 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -16,6 +16,7 @@ #include "pstree.h" #include "criu-log.h" #include +#include "plugin.h" #include "proc_parse.h" #include "seccomp.h" #include "seize.h" @@ -24,6 +25,19 @@ #include "xmalloc.h" #include "util.h" +static bool freeze_cgroup_disabled; + +/* + * Disables the use of freeze cgroups for process seizing, even if explicitly + * requested via the --freeze-cgroup option. 
This is necessary for plugins + * (e.g., CUDA) that do not function correctly when processes are frozen using + * cgroups. + */ +void __attribute__((used)) dont_use_freeze_cgroup(void) +{ + freeze_cgroup_disabled = true; +} + char *task_comm_info(pid_t pid, char *comm, size_t size) { bool is_read = false; @@ -396,7 +410,7 @@ static int freezer_detach(void) { int i; - if (!opts.freeze_cgroup) + if (!opts.freeze_cgroup || freeze_cgroup_disabled) return 0; for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { @@ -491,6 +505,31 @@ static int log_unfrozen_stacks(char *root) return 0; } +static int check_freezer_cgroup(void) +{ + enum freezer_state state = THAWED; + int fd; + + BUG_ON(!freeze_cgroup_disabled); + + fd = freezer_open(); + if (fd < 0) + return -1; + + state = get_freezer_state(fd); + close(fd); + if (state == FREEZER_ERROR) { + return -1; + } + + if (state != THAWED) { + pr_err("One or more plugins are incompatible with the freezer cgroup in the FROZEN state.\n"); + return -1; + } + + return 0; +} + static int freeze_processes(void) { int fd, exit_code = -1; @@ -637,7 +676,12 @@ static int collect_children(struct pstree_item *item) goto free; } - if (!opts.freeze_cgroup) + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto free; + } + + if (!opts.freeze_cgroup || freeze_cgroup_disabled) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -825,7 +869,8 @@ static int collect_threads(struct pstree_item *item) pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid); - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) + if ((!opts.freeze_cgroup || freeze_cgroup_disabled) && + compel_interrupt_task(pid)) continue; ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL); @@ -881,7 +926,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i { int attempts = NR_ATTEMPTS, nr_inprogress = 1; - if (opts.freeze_cgroup) + if 
(opts.freeze_cgroup && !freeze_cgroup_disabled) attempts = 1; /* @@ -966,6 +1011,7 @@ int collect_pstree(void) pid_t pid = root_item->pid->real; int ret = -1; struct proc_status_creds creds; + struct pstree_item *iter; timing_start(TIME_FREEZING); @@ -976,17 +1022,26 @@ int collect_pstree(void) */ alarm(opts.timeout); + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (opts.freeze_cgroup && cgroup_version()) goto err; pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1); - if (opts.freeze_cgroup && freeze_processes()) - goto err; - - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { - set_cr_errno(ESRCH); - goto err; + if (opts.freeze_cgroup && !freeze_cgroup_disabled) { + if (freeze_processes()) + goto err; + } else { + if (opts.freeze_cgroup && check_freezer_cgroup()) + goto err; + if (compel_interrupt_task(pid)) { + set_cr_errno(ESRCH); + goto err; + } } ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL); @@ -1012,11 +1067,20 @@ int collect_pstree(void) if (ret < 0) goto err; - if (opts.freeze_cgroup && freezer_wait_processes()) { + if (opts.freeze_cgroup && !freeze_cgroup_disabled && + freezer_wait_processes()) { ret = -1; goto err; } + for_each_pstree_item(iter) { + if (!task_alive(iter)) + continue; + ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } + ret = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); diff --git a/criu/shmem.c b/criu/shmem.c index c13a39b660..9e3178352d 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,23 +206,28 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma) +static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; + u64 vaddr; if (!is_shmem_tracking_en()) return; vma_pgcnt = DIV_ROUND_UP(si->size - 
vma->pgoff, PAGE_SIZE); - for (vma_pfn = 0; vma_pfn < vma_pgcnt; ++vma_pfn) { - if (!should_dump_page(vma, map[vma_pfn])) + for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { + bool softdirty = false; + u64 next; + + next = should_dump_page(pmc, vma, vaddr, &softdirty); + if (next != vaddr) { + vaddr = next - PAGE_SIZE; continue; + } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (map[vma_pfn] & PME_SOFT_DIRTY) + if (softdirty) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); - else if (page_is_zero(map[vma_pfn])) - set_pstate(si->pstate_map, shmem_pfn, PST_ZERO); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } @@ -648,7 +653,7 @@ static int open_shmem(int pid, struct vma_area *vma) return -1; } -int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) +int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) { struct shmem_info *si; unsigned long size = vma->pgoff + (vma->end - vma->start); @@ -662,7 +667,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; } @@ -679,7 +684,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; } diff --git a/criu/sigact.c b/criu/sigact.c new file mode 100644 index 0000000000..5174644d28 --- /dev/null +++ b/criu/sigact.c @@ -0,0 +1,319 @@ +#include "types.h" +#include "infect.h" +#include "protobuf.h" +#include "pstree.h" +#include "parasite.h" +#include "restorer.h" +#include "sigact.h" + +/* + * If parent's sigaction has blocked SIGKILL (which is non-sense), + * this parent action is non-valid and shouldn't be inherited. + * Used to mark parent_act* no more valid. 
+ */ +static rt_sigaction_t parent_act[SIGMAX]; +#ifdef CONFIG_COMPAT +static rt_sigaction_t_compat parent_act_compat[SIGMAX]; +#endif + +static bool sa_inherited(int sig, rt_sigaction_t *sa) +{ + rt_sigaction_t *pa; + int i; + + if (current == root_item) + return false; /* XXX -- inherit from CRIU? */ + + pa = &parent_act[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static void *stack32; +rt_sigaction_t sigchld_act; + +#ifdef CONFIG_COMPAT +static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) +{ + rt_sigaction_t_compat *pa; + int i; + + if (current == root_item) + return false; + + pa = &parent_act_compat[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t_compat act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); + + if (sig == SIGCHLD) { + memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); + return 0; + } + + if (sa_compat_inherited(sig - 1, &act)) + return 1; + + if (!stack32) { + stack32 = alloc_compat_syscall_stack(); + if (!stack32) + return -1; + } + + ret = arch_compat_rt_sigaction(stack32, sig, &act); + if 
(ret < 0) { + pr_err("Can't restore compat sigaction: %d\n", ret); + return ret; + } + + parent_act_compat[sig - 1] = act; + /* Mark SIGKILL blocked which makes native sigaction non-valid */ + parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; + + return 1; +} +#else +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + return -1; +} +#endif + +static int restore_native_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); +#ifdef CONFIG_MIPS + e->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); + + memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); + memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); +#else + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); +#endif + if (sig == SIGCHLD) { + sigchld_act = act; + return 0; + } + + if (sa_inherited(sig - 1, &act)) + return 1; + + /* + * A pure syscall is used, because glibc + * sigaction overwrites se_restorer. 
+ */ + ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); + if (ret < 0) { + pr_perror("Can't restore sigaction"); + return ret; + } + + parent_act[sig - 1] = act; + /* Mark SIGKILL blocked which makes compat sigaction non-valid */ +#ifdef CONFIG_COMPAT + parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; +#endif + + return 1; +} + +static int prepare_sigactions_from_core(TaskCoreEntry *tc) +{ + int sig, i; + + if (tc->n_sigactions != SIGMAX - 2) { + pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); + return -1; + } + + pr_info("Restore on-core sigactions for %d\n", vpid(current)); + + for (sig = 1, i = 0; sig <= SIGMAX; sig++) { + int ret; + SaEntry *e; + bool sigaction_is_compat; + + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + e = tc->sigactions[i++]; + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + if (ret < 0) + return ret; + } + + return 0; +} + +/* Returns number of restored signals, -1 or negative errno on fail */ +static int restore_one_sigaction(int sig, struct cr_img *img, int pid) +{ + bool sigaction_is_compat; + SaEntry *e; + int ret = 0; + + BUG_ON(sig == SIGKILL || sig == SIGSTOP); + + ret = pb_read_one_eof(img, &e, PB_SIGACT); + if (ret == 0) { + if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ + pr_err("Unexpected EOF %d\n", sig); + return -1; + } + pr_warn("This format of sigacts-%d.img is deprecated\n", pid); + return -1; + } + if (ret < 0) + return ret; + + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + sa_entry__free_unpacked(e, NULL); + + return ret; +} + +static int prepare_sigactions_from_image(void) +{ + int pid = vpid(current); + struct cr_img *img; + int 
sig, rst = 0; + int ret = 0; + + pr_info("Restore sigacts for %d\n", pid); + + img = open_image(CR_FD_SIGACT, O_RSTR, pid); + if (!img) + return -1; + + for (sig = 1; sig <= SIGMAX; sig++) { + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + ret = restore_one_sigaction(sig, img, pid); + if (ret < 0) + break; + if (ret) + rst++; + } + + pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); + + close_image(img); + return ret; +} + +int prepare_sigactions(CoreEntry *core) +{ + int ret; + + if (!task_alive(current)) + return 0; + + if (core->tc->n_sigactions != 0) + ret = prepare_sigactions_from_core(core->tc); + else + ret = prepare_sigactions_from_image(); + + if (stack32) { + free_compat_syscall_stack(stack32); + stack32 = NULL; + } + + return ret; +} + +int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + TaskCoreEntry *tc = item->core[0]->tc; + struct parasite_dump_sa_args *args; + int ret, sig; + SaEntry *sa, **psa; + + args = compel_parasite_args(ctl, struct parasite_dump_sa_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); + if (ret < 0) + return ret; + + psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); + if (!psa) + return -1; + + sa = (SaEntry *)(psa + SIGMAX - 2); + + tc->n_sigactions = SIGMAX - 2; + tc->sigactions = psa; + + for (sig = 1; sig <= SIGMAX; sig++) { + int i = sig - 1; + + if (sig == SIGSTOP || sig == SIGKILL) + continue; + + sa_entry__init(sa); + ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); + ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); + ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); +#ifdef CONFIG_MIPS + sa->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); + memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); +#else + 
BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); +#endif + sa->has_compat_sigaction = true; + sa->compat_sigaction = !compel_mode_native(ctl); + + *(psa++) = sa++; + } + + return 0; +} diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 4bd5abff17..92f53e5697 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -416,9 +416,13 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io } else { ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); ret |= dump_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + ret |= dump_opt(sk, SOL_IP, IP_TOS, &ioe->tos); + ret |= dump_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); } ioe->has_freebind = ioe->freebind; ioe->has_pktinfo = !!ioe->pktinfo; + ioe->has_tos = !!ioe->tos; + ioe->has_ttl = !!ioe->ttl; return ret; } @@ -450,6 +454,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT; IpOptsRawEntry ipopts_raw = IP_OPTS_RAW_ENTRY__INIT; SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; + TcpOptsEntry tcpopts = TCP_OPTS_ENTRY__INIT; int ret = -1, err = -1, proto, aux, type; ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL, &proto, sizeof(proto)); @@ -517,6 +522,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa ie.opts = &skopts; ie.ip_opts = &ipopts; ie.ip_opts->raw = &ipopts_raw; + ie.tcp_opts = &tcpopts; ie.n_src_addr = PB_ALEN_INET; ie.n_dst_addr = PB_ALEN_INET; @@ -577,9 +583,20 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa switch (proto) { case IPPROTO_TCP: - err = (type != SOCK_RAW) ? 
dump_one_tcp(lfd, sk, &skopts) : 0; if (sk->shutdown) sk_encode_shutdown(&ie, sk->shutdown); + + if (type == SOCK_RAW) { + err = 0; + } else { + err = dump_tcp_opts(lfd, &tcpopts); + if (err < 0) + goto err; + + err = dump_one_tcp(lfd, sk, &skopts); + if (err < 0) + goto err; + } break; case IPPROTO_UDP: case IPPROTO_UDPLITE: @@ -813,6 +830,10 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); if (ioe->has_pktinfo) ret |= restore_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + if (ioe->has_tos) + ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); + if (ioe->has_ttl) + ret |= restore_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); } if (ioe->raw) @@ -931,6 +952,9 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) if (restore_socket_opts(sk, ie->opts)) goto err; + if (ie->proto == IPPROTO_TCP && restore_tcp_opts(sk, ie->tcp_opts)) + goto err; + if (ie->has_shutdown && (ie->proto == IPPROTO_UDP || ie->proto == IPPROTO_UDPLITE || ie->proto == IPPROTO_TCP)) { if (shutdown(sk, sk_decode_shutdown(ie->shutdown))) { diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 96d5d13bf6..9c8bad1c3f 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -39,6 +39,8 @@ static int lock_connection(struct inet_sk_desc *sk) return iptables_lock_connection(sk); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) return nftables_lock_connection(sk); + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -50,6 +52,8 @@ static int unlock_connection(struct inet_sk_desc *sk) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -131,7 +135,8 @@ void cpt_unlock_tcp_connections(void) static int dump_tcp_conn_state(struct inet_sk_desc *sk) { struct libsoccr_sk *socr = sk->priv; - int ret, aux; + int exit_code = -1; + int ret; 
struct cr_img *img; TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT; char *buf; @@ -140,11 +145,11 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) ret = libsoccr_save(socr, &data, sizeof(data)); if (ret < 0) { pr_err("libsoccr_save() failed with %d\n", ret); - goto err_r; + goto err; } if (ret != sizeof(data)) { pr_err("This libsocr is not supported (%d vs %d)\n", ret, (int)sizeof(data)); - goto err_r; + goto err; } sk->state = data.state; @@ -181,43 +186,22 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) tse.rcv_wup = data.rcv_wup; } - /* - * TCP socket options - */ - - if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux)) - goto err_opt; - - if (aux) { - tse.has_nodelay = true; - tse.nodelay = true; - } - - if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux)) - goto err_opt; - - if (aux) { - tse.has_cork = true; - tse.cork = true; - } - /* * Push the stuff to image */ - img = open_image(CR_FD_TCP_STREAM, O_DUMP, sk->sd.ino); if (!img) - goto err_img; + goto err; ret = pb_write_one(img, &tse, PB_TCP_STREAM); if (ret < 0) - goto err_iw; + goto err_close; buf = libsoccr_get_queue_bytes(socr, TCP_RECV_QUEUE, SOCCR_MEM_EXCL); if (buf) { ret = write_img_buf(img, buf, tse.inq_len); if (ret < 0) - goto err_iw; + goto err_close; xfree(buf); } @@ -226,40 +210,40 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) if (buf) { ret = write_img_buf(img, buf, tse.outq_len); if (ret < 0) - goto err_iw; + goto err_close; xfree(buf); } pr_info("Done\n"); -err_iw: + exit_code = 0; +err_close: close_image(img); -err_img: -err_opt: -err_r: - return ret; +err: + return exit_code; } -int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) +int dump_tcp_opts(int fd, TcpOptsEntry *toe) { - soe->has_tcp_keepcnt = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt)) { - pr_perror("Can't read TCP_KEEPCNT"); - return -1; - } + int ret = 0; - soe->has_tcp_keepidle = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &soe->tcp_keepidle)) { - 
pr_perror("Can't read TCP_KEEPIDLE"); - return -1; - } + ret |= dump_opt(fd, SOL_TCP, TCP_NODELAY, &toe->nodelay); + ret |= dump_opt(fd, SOL_TCP, TCP_CORK, &toe->cork); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); - soe->has_tcp_keepintvl = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &soe->tcp_keepintvl)) { - pr_perror("Can't read TCP_KEEPINTVL"); - return -1; - } + toe->has_nodelay = !!toe->nodelay; + toe->has_cork = !!toe->cork; + toe->has_keepcnt = !!toe->keepcnt; + toe->has_keepidle = !!toe->keepidle; + toe->has_keepintvl = !!toe->keepintvl; + return ret; +} + +int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) +{ if (sk->dst_port == 0) return 0; @@ -393,6 +377,11 @@ static int restore_tcp_conn_state(int sk, struct libsoccr_sk *socr, struct inet_ if (libsoccr_restore(socr, &data, sizeof(data))) goto err_c; + /* + * Restoring TCP socket options in TcpStreamEntry is + * for backward compatibility only, newer versions + * of CRIU use TcpOptsEntry. 
+ */ if (tse->has_nodelay && tse->nodelay) { aux = 1; if (restore_opt(sk, SOL_TCP, TCP_NODELAY, &aux)) @@ -445,6 +434,27 @@ int prepare_tcp_socks(struct task_restore_args *ta) return 0; } +int restore_tcp_opts(int sk, TcpOptsEntry *toe) +{ + int ret = 0; + + if(!toe) + return ret; + + if (toe->has_nodelay) + ret |= restore_opt(sk, SOL_TCP, TCP_NODELAY, &toe->nodelay); + if (toe->has_cork) + ret |= restore_opt(sk, SOL_TCP, TCP_CORK, &toe->cork); + if (toe->has_keepcnt) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); + if (toe->has_keepidle) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); + if (toe->has_keepintvl) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); + + return ret; +} + int restore_one_tcp(int fd, struct inet_sk_info *ii) { struct libsoccr_sk *sk; @@ -483,6 +493,8 @@ static int unlock_connection_info(struct inet_sk_info *si) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 841152643a..70ca16be4a 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -878,7 +878,8 @@ static int __dump_external_socket(struct unix_sk_desc *sk, struct unix_sk_desc * if (peer->type != SOCK_DGRAM) { show_one_unix("Ext stream not supported", peer); - pr_err("Can't dump half of stream unix connection.\n"); + pr_err("Can't dump half of stream unix connection. 
name: %s; peer name: %s\n", + sk->name, peer->name); return -1; } @@ -1430,32 +1431,22 @@ static int post_open_standalone(struct file_desc *d, int fd) static int restore_file_perms(struct unix_sk_info *ui) { - if (ui->ue->file_perms) { - FilePermsEntry *perms = ui->ue->file_perms; - char fname[PATH_MAX]; + FilePermsEntry *perms = ui->ue->file_perms; + char fname[PATH_MAX]; - if (ui->ue->name.len >= sizeof(fname)) { - pr_err("The file name is too long\n"); - return -E2BIG; - } - - memcpy(fname, ui->name, ui->ue->name.len); - fname[ui->ue->name.len] = '\0'; - - if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file owner and group"); - return -errno_cpy; - } + if (!perms) + return 0; - if (fchmodat(AT_FDCWD, fname, perms->mode, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file mode bits"); - return -errno_cpy; - } + if (ui->ue->name.len >= sizeof(fname)) { + pr_err("The file name is too long\n"); + errno = -E2BIG; + return -1; } - return 0; + memcpy(fname, ui->name, ui->ue->name.len); + fname[ui->ue->name.len] = '\0'; + + return cr_fchpermat(AT_FDCWD, fname, perms->uid, perms->gid, perms->mode, 0); } static int keep_deleted(struct unix_sk_info *ui) diff --git a/criu/sockets.c b/criu/sockets.c index d17e0a9869..f9ce999bed 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -38,7 +38,7 @@ #define SOCK_DIAG_BY_FAMILY 20 #endif -#define SK_HASH_SIZE 32 +#define SK_HASH_SIZE (1 << 14) #ifndef SO_GET_FILTER #define SO_GET_FILTER SO_ATTACH_FILTER @@ -585,6 +585,12 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_debug("\tset keepalive for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_KEEPALIVE, &val); } + + /* + * Restoring TCP socket options in SkOptsEntry is + * for backward compatibility only, newer versions + * of CRIU use TcpOptsEntry. 
+ */ if (soe->has_tcp_keepcnt) { pr_debug("\tset keepcnt for socket\n"); ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt); diff --git a/criu/timens.c b/criu/timens.c index 66c0c02a42..257782e5a5 100644 --- a/criu/timens.c +++ b/criu/timens.c @@ -96,8 +96,8 @@ int prepare_timens(int id) ts.tv_nsec = te->monotonic->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: monotonic %ld %ld\n", ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %ld %ld\n", CLOCK_MONOTONIC, ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: monotonic %" PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_MONOTONIC, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a monotonic clock offset"); goto err; } @@ -111,8 +111,8 @@ int prepare_timens(int id) ts.tv_nsec = te->boottime->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: boottime %ld %ld\n", ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %ld %ld\n", CLOCK_BOOTTIME, ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: boottime %" PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_BOOTTIME, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a boottime clock offset"); goto err; } diff --git a/criu/timer.c b/criu/timer.c new file mode 100644 index 0000000000..e94cf0280d --- /dev/null +++ b/criu/timer.c @@ -0,0 +1,400 @@ +#include "types.h" +#include "crtools.h" +#include "infect.h" +#include "protobuf.h" +#include "pstree.h" +#include "posix-timer.h" +#include "parasite.h" +#include "namespaces.h" +#include "rst-malloc.h" +#include "restorer.h" + +static inline int timeval_valid(struct timeval *tv) +{ + return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); +} + +static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) +{ + if (ie->isec == 0 && ie->iusec == 0) { + memzero_p(val); + return 0; + } + + val->it_interval.tv_sec = ie->isec; + 
val->it_interval.tv_usec = ie->iusec; + + if (!timeval_valid(&val->it_interval)) { + pr_err("Invalid timer interval\n"); + return -1; + } + + if (ie->vsec == 0 && ie->vusec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + val->it_value.tv_sec = ie->isec; + val->it_value.tv_usec = ie->iusec; + } else { + val->it_value.tv_sec = ie->vsec; + val->it_value.tv_usec = ie->vusec; + } + + if (!timeval_valid(&val->it_value)) { + pr_err("Invalid timer value\n"); + return -1; + } + + pr_info("Restored %s timer to %" PRId64 ".%" PRId64 " -> %" PRId64 ".%" PRId64 "\n", n, + (int64_t)val->it_value.tv_sec, (int64_t)val->it_value.tv_usec, + (int64_t)val->it_interval.tv_sec, (int64_t)val->it_interval.tv_usec); + + return 0; +} + +/* + * Legacy itimers restore from CR_FD_ITIMERS + */ + +int prepare_itimers_from_fd(int pid, struct task_restore_args *args) +{ + int ret = -1; + struct cr_img *img; + ItimerEntry *ie; + + if (!deprecated_ok("Itimers")) + return -1; + + img = open_image(CR_FD_ITIMERS, O_RSTR, pid); + if (!img) + return -1; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("real", ie, &args->itimers[0]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("virt", ie, &args->itimers[1]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("prof", ie, &args->itimers[2]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; +out: + close_image(img); + return ret; +} + +int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) +{ + int ret = 0; + TaskTimersEntry *tte = core->tc->timers; + + if (!tte) + return prepare_itimers_from_fd(pid, args); + + ret |= decode_itimer("real", tte->real, &args->itimers[0]); + ret |= 
decode_itimer("virt", tte->virt, &args->itimers[1]); + ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); + + return ret; +} + +static inline int timespec_valid(struct timespec *ts) +{ + return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); +} + +static inline int decode_posix_timer(PosixTimerEntry *pte, struct restore_posix_timer *pt) +{ + pt->val.it_interval.tv_sec = pte->isec; + pt->val.it_interval.tv_nsec = pte->insec; + + if (!timespec_valid(&pt->val.it_interval)) { + pr_err("Invalid timer interval(posix)\n"); + return -1; + } + + if (pte->vsec == 0 && pte->vnsec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + pt->val.it_value.tv_sec = pte->isec; + pt->val.it_value.tv_nsec = pte->insec; + } else { + pt->val.it_value.tv_sec = pte->vsec; + pt->val.it_value.tv_nsec = pte->vnsec; + } + + if (!timespec_valid(&pt->val.it_value)) { + pr_err("Invalid timer value(posix)\n"); + return -1; + } + + pt->spt.it_id = pte->it_id; + pt->spt.clock_id = pte->clock_id; + pt->spt.si_signo = pte->si_signo; + pt->spt.it_sigev_notify = pte->it_sigev_notify; + pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); + pt->spt.notify_thread_id = pte->notify_thread_id; + pt->overrun = pte->overrun; + + return 0; +} + +static int cmp_posix_timer_proc_id(const void *p1, const void *p2) +{ + return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; +} + +static void sort_posix_timers(struct task_restore_args *ta) +{ + void *tmem; + + /* + * This is required for restorer's create_posix_timers(), + * it will probe them one-by-one for the desired ID, since + * kernel doesn't provide another API for timer creation + * with given ID. 
+ */ + + if (ta->posix_timers_n > 0) { + tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); + qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); + } +} + +/* + * Legacy posix timers restoration from CR_FD_POSIX_TIMERS + */ + +int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) +{ + struct cr_img *img; + int ret = -1; + struct restore_posix_timer *t; + + if (!deprecated_ok("Posix timers")) + return -1; + + img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); + if (!img) + return -1; + + ta->posix_timers_n = 0; + while (1) { + PosixTimerEntry *pte; + + ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); + if (ret <= 0) + break; + + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + break; + + ret = decode_posix_timer(pte, t); + if (ret < 0) + break; + + posix_timer_entry__free_unpacked(pte, NULL); + ta->posix_timers_n++; + } + + close_image(img); + if (!ret) + sort_posix_timers(ta); + + return ret; +} + +int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) +{ + int i, ret = -1; + TaskTimersEntry *tte = core->tc->timers; + struct restore_posix_timer *t; + + ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); + + if (!tte) + return prepare_posix_timers_from_fd(pid, ta); + + ta->posix_timers_n = tte->n_posix; + for (i = 0; i < ta->posix_timers_n; i++) { + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + goto out; + + if (decode_posix_timer(tte->posix[i], t)) + goto out; + } + + ret = 0; + sort_posix_timers(ta); +out: + return ret; +} + +static void encode_itimer(struct itimerval *v, ItimerEntry *ie) +{ + ie->isec = v->it_interval.tv_sec; + ie->iusec = v->it_interval.tv_usec; + ie->vsec = v->it_value.tv_sec; + ie->vusec = v->it_value.tv_usec; +} + +int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + struct 
parasite_dump_itimers_args *args; + int ret; + + args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); + if (ret < 0) + return ret; + + encode_itimer((&args->real), (core->tc->timers->real)); + encode_itimer((&args->virt), (core->tc->timers->virt)); + encode_itimer((&args->prof), (core->tc->timers->prof)); + + return 0; +} + +static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) +{ + int sz; + + /* + * Will be free()-ed in core_entry_free() + */ + + sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); + tte->posix = xmalloc(sz); + if (!tte->posix) + return -1; + + tte->n_posix = n; + *pte = (PosixTimerEntry *)(tte->posix + n); + return 0; +} + +static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) +{ + pid_t vtid = 0; + int i; + + if (rtid == 0) + return 0; + + if (!(root_ns_mask & CLONE_NEWPID)) { + /* Non-pid-namespace case */ + pte->notify_thread_id = rtid; + pte->has_notify_thread_id = true; + return 0; + } + + /* Pid-namespace case */ + if (!kdat.has_nspid) { + pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + if (item->threads[i].real != rtid) + continue; + + vtid = item->threads[i].ns[0].virt; + break; + } + + if (vtid == 0) { + pr_err("Unable to convert the notify thread id %d\n", rtid); + return -1; + } + + pte->notify_thread_id = vtid; + pte->has_notify_thread_id = true; + return 0; +} + +static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, + PosixTimerEntry *pte) +{ + pte->it_id = vp->spt.it_id; + pte->clock_id = vp->spt.clock_id; + pte->si_signo = vp->spt.si_signo; + pte->it_sigev_notify = vp->spt.it_sigev_notify; + pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); + + pte->overrun = v->overrun; + + pte->isec = v->val.it_interval.tv_sec; + 
pte->insec = v->val.it_interval.tv_nsec; + pte->vsec = v->val.it_value.tv_sec; + pte->vnsec = v->val.it_value.tv_nsec; + + if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) + return -1; + + return 0; +} + +int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + TaskTimersEntry *tte = core->tc->timers; + PosixTimerEntry *pte; + struct proc_posix_timer *temp; + struct parasite_dump_posix_timers_args *args; + int ret, exit_code = -1; + int args_size; + int i; + + if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) + return -1; + + args_size = posix_timers_dump_size(proc_args->timer_n); + args = compel_parasite_args_s(ctl, args_size); + args->timer_n = proc_args->timer_n; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + args->timer[i].it_id = temp->spt.it_id; + i++; + } + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); + if (ret < 0) + goto end_posix; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + posix_timer_entry__init(&pte[i]); + if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) + goto end_posix; + tte->posix[i] = &pte[i]; + i++; + } + + exit_code = 0; +end_posix: + free_posix_timers(proc_args); + return exit_code; +} diff --git a/criu/tty.c b/criu/tty.c index 199984ec08..ae23094b7b 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -22,6 +22,7 @@ #include "rst-malloc.h" #include "log.h" #include "common/list.h" +#include "util.h" #include "util-pie.h" #include "proc_parse.h" #include "file-ids.h" @@ -816,8 +817,26 @@ static int do_restore_tty_parms(void *arg, int fd, pid_t pid) * on termios too. Just to be on the safe side. 
*/ - if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) - goto err; + if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) { + struct termios t; + + if (errno != EPERM) + goto err; + + memzero(&t, sizeof(t)); + if (ioctl(fd, TIOCGLCKTRMIOS, &t) < 0) { + pr_perror("Can't get tty locked params on %#x", p->tty_id); + goto err; + } + + /* + * The ioctl(TIOCSLCKTRMIOS) requires a CRIU process to be privileged + * in the init_user_ns, but if the current "termios_locked" value equal + * to the "termios_locked" value from the image, we can safely skip setting it. + */ + if (memcmp(&t, &p->tl, sizeof(struct termios)) != 0) + goto err; + } if ((p->has & HAS_TERMIOS) && ioctl(fd, TCSETS, &p->t) < 0) goto err; @@ -867,7 +886,7 @@ static int restore_tty_params(int fd, struct tty_info *info) } if (info->tie->has_uid && info->tie->has_gid) { - if (fchown(fd, info->tie->uid, info->tie->gid)) { + if (cr_fchown(fd, info->tie->uid, info->tie->gid)) { pr_perror("Can't setup uid %d gid %d on %#x", (int)info->tie->uid, (int)info->tie->gid, info->tfe->id); return -1; diff --git a/criu/tun.c b/criu/tun.c index 2e2cc32bf5..9d66f99296 100644 --- a/criu/tun.c +++ b/criu/tun.c @@ -455,27 +455,26 @@ int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **in TunLinkEntry tle = TUN_LINK_ENTRY__INIT; char spath[64]; char buf[64]; - int ret = 0; struct tun_link *tl; sprintf(spath, "class/net/%s/tun_flags", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.flags = strtol(buf, NULL, 0); sprintf(spath, "class/net/%s/owner", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.owner = strtol(buf, NULL, 10); sprintf(spath, "class/net/%s/group", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.group 
= strtol(buf, NULL, 10); - if (ret < 0) - return ret; - tl = get_tun_link_fd(nde->name, nde->peer_nsid, tle.flags); if (!tl) - return ret; + return -1; tle.vnethdr = tl->dmp.vnethdr; tle.sndbuf = tl->dmp.sndbuf; diff --git a/criu/util.c b/criu/util.c index db96cf938f..d2bc9a8657 100644 --- a/criu/util.c +++ b/criu/util.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -40,7 +39,6 @@ #include "mem.h" #include "namespaces.h" #include "criu-log.h" -#include "syscall.h" #include "util-caps.h" #include "clone-noasan.h" @@ -55,6 +53,7 @@ #include "action-scripts.h" #include "compel/infect-util.h" +#include #define VMA_OPT_LEN 128 @@ -519,12 +518,25 @@ int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned return cr_system_userns(in, out, err, cmd, argv, flags, -1); } -static int close_fds(int minfd) +int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) +{ + return syscall(__NR_close_range, fd, max_fd, flags); +} + +int close_fds(int minfd) { DIR *dir; struct dirent *de; int fd, ret, dfd; + if (kdat.has_close_range) { + if (cr_close_range(minfd, ~0, 0)) { + pr_perror("close_range failed"); + return -1; + } + return 0; + } + dir = opendir("/proc/self/fd"); if (dir == NULL) { pr_perror("Can't open /proc/self/fd"); @@ -662,40 +674,54 @@ int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], un return ret; } +struct child_args { + int *sk_pair; + int (*child_setup)(void); +}; + +static int child_func(void *_args) +{ + struct child_args *args = _args; + int sk, *sk_pair = args->sk_pair; + char c = 0; + + sk = sk_pair[1]; + close(sk_pair[0]); + + if (args->child_setup && args->child_setup() != 0) + exit(1); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + exit(1); + } + + while (1) + sleep(1000); + exit(1); +} + pid_t fork_and_ptrace_attach(int (*child_setup)(void)) { pid_t pid; int sk_pair[2], sk; char c = 0; + struct child_args cargs = { + .sk_pair = sk_pair, + 
.child_setup = child_setup, + }; if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } - pid = fork(); + pid = clone_noasan(child_func, CLONE_UNTRACED | SIGCHLD, &cargs); if (pid < 0) { pr_perror("fork"); return -1; } - if (pid == 0) { - sk = sk_pair[1]; - close(sk_pair[0]); - - if (child_setup && child_setup() != 0) - exit(1); - - if (write(sk, &c, 1) != 1) { - pr_perror("write"); - exit(1); - } - - while (1) - sleep(1000); - exit(1); - } - sk = sk_pair[0]; close(sk_pair[1]); @@ -952,6 +978,89 @@ FILE *fopenat(int dirfd, char *path, char *cflags) return fdopen(tmp, cflags); } +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid) +{ + struct stat st; + + if (!fchown(fd, new_uid, new_gid)) + return 0; + if (errno != EPERM) + return -1; + + if (fstat(fd, &st) < 0) { + pr_perror("fstat() after fchown() for fd %d", fd); + goto out_eperm; + } + pr_debug("fstat(%d): uid %u gid %u\n", fd, st.st_uid, st.st_gid); + + if (new_uid != st.st_uid || new_gid != st.st_gid) + goto out_eperm; + + return 0; +out_eperm: + errno = EPERM; + return -1; +} + +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags) +{ + struct stat st; + int ret; + + if (fchownat(dirfd, path, new_uid, new_gid, flags) < 0 && errno != EPERM) { + int errno_cpy = errno; + pr_perror("Unable to change [%d]/%s ownership to (%d, %d)", + dirfd, path, new_uid, new_gid); + errno = errno_cpy; + return -1; + } + + if (fstatat(dirfd, path, &st, flags) < 0) { + int errno_cpy = errno; + pr_perror("Unable to stat [%d]/%s", dirfd, path); + errno = errno_cpy; + return -1; + } + + if (new_uid != st.st_uid || new_gid != st.st_gid) { + errno = EPERM; + pr_perror("Unable to change [%d]/%s ownership (%d, %d) to (%d, %d)", + dirfd, path, st.st_uid, st.st_gid, new_uid, new_gid); + errno = EPERM; + return -1; + } + + if (new_mode == st.st_mode) + return 0; + + if (S_ISLNK(st.st_mode)) { + /* + * We have no lchmod() function, and fchmod() will fail 
on + * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() + * function and flag AT_SYMLINK_NOFOLLOW described in + * man 2 fchmodat, but it is not currently implemented. %) + */ + return 0; + } + + if (!*path && flags & AT_EMPTY_PATH) + ret = fchmod(dirfd, new_mode); + else + ret = fchmodat(dirfd, path, new_mode, flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)); + if (ret < 0) { + int errno_cpy = errno; + pr_perror("Unable to set perms %o on [%d]/%s", new_mode, dirfd, path); + errno = errno_cpy; + } + + return ret; +} + +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode) +{ + return cr_fchpermat(fd, "", new_uid, new_gid, new_mode, AT_EMPTY_PATH); +} + void split(char *str, char token, char ***out, int *n) { int i; @@ -1072,20 +1181,6 @@ const char *ns_to_string(unsigned int ns) } } -void tcp_cork(int sk, bool on) -{ - int val = on ? 1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) - pr_perror("Unable to restore TCP_CORK (%d)", val); -} - -void tcp_nodelay(int sk, bool on) -{ - int val = on ? 
1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) - pr_perror("Unable to restore TCP_NODELAY (%d)", val); -} - static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, unsigned short port) { memset(addr, 0, sizeof(*addr)); @@ -1460,23 +1555,78 @@ void print_stack_trace(pid_t pid) } #endif +int cr_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + int ret = syscall(__NR_fsconfig, fd, cmd, key, value, aux); + if (ret) + fsfd_dump_messages(fd); + return ret; +} + +int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + int ret = syscall(__NR_fsmount, fd, flags, attr_flags); + if (ret) + fsfd_dump_messages(fd); + return ret; +} + +void fsfd_dump_messages(int fd) +{ + char buf[4096]; + int err, n; + + err = errno; + + for (;;) { + n = read(fd, buf, sizeof(buf) - 1); + if (n < 0) { + if (errno != ENODATA) + pr_perror("Unable to read from fs descriptor"); + break; + } + buf[n] = 0; + + switch (buf[0]) { + case 'w': + pr_warn("%s\n", buf); + break; + case 'i': + pr_info("%s\n", buf); + break; + case 'e': + /* fallthrough */ + default: + pr_err("%s\n", buf); + break; + } + } + + errno = err; +} + int mount_detached_fs(const char *fsname) { int fsfd, fd; - fsfd = sys_fsopen(fsname, 0); + fsfd = cr_fsopen(fsname, 0); if (fsfd < 0) { pr_perror("Unable to open the %s file system", fsname); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { pr_perror("Unable to create the %s file system", fsname); close(fsfd); return -1; } - fd = sys_fsmount(fsfd, 0, 0); + fd = cr_fsmount(fsfd, 0, 0); if (fd < 0) pr_perror("Unable to mount the %s file system", fsname); close(fsfd); @@ -1566,7 +1716,7 @@ static int is_iptables_nft(char *bin) goto err; } - ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, 0); + ret = 
cr_system(-1, pfd[1], -1, cmd[0], cmd, CRS_CAN_FAIL); if (ret) { pr_err("%s -V failed\n", cmd[0]); goto err; @@ -1880,11 +2030,16 @@ uint64_t criu_run_id; void util_init(void) { - struct timespec tp; + struct stat statbuf; + + criu_run_id = getpid(); + if (!stat("/proc/self/ns/pid", &statbuf)) + criu_run_id |= (uint64_t)statbuf.st_ino << 32; + else if (errno != ENOENT) + pr_perror("Can't stat /proc/self/ns/pid - CRIU run id might not be unique"); - clock_gettime(CLOCK_MONOTONIC, &tp); - criu_run_id = ((uint64_t)getpid() << 32) + tp.tv_sec + tp.tv_nsec; compel_run_id = criu_run_id; + pr_info("CRIU run id = %#" PRIx64 "\n", criu_run_id); } /* diff --git a/flog/Makefile b/flog/Makefile new file mode 100644 index 0000000000..12255af719 --- /dev/null +++ b/flog/Makefile @@ -0,0 +1,29 @@ +OPTS=-ggdb3 -Wall -Werror +export OPTS + +CFLAGS += -iquote include +CFLAGS += -iquote flog/include +CFLAGS += -iquote flog/include/uapi + +include $(__nmk_dir)msg.mk + +$(eval $(call gen-built-in,src)) + +flog: + $(Q) $(MAKE) $(build)=$(obj)/src all +.PHONY: flog + +clean-flog: + $(call msg-gen, $@) + $(Q) $(MAKE) $(build)=$(obj)/src clean + $(Q) $(RM) built-in.o +.PHONY: clean-flog + +clean: clean-flog +mrproper: clean + +test: + ./tests/test00 + +all-y += flog + diff --git a/flog/built-in.S b/flog/built-in.S new file mode 100644 index 0000000000..26627d0544 --- /dev/null +++ b/flog/built-in.S @@ -0,0 +1,4 @@ +SECTIONS +{ + .rodata : { _rodata_start = . ; *(.rodata*) ; _rodata_end = . ;} +} diff --git a/flog/include/compiler.h b/flog/include/compiler.h new file mode 100644 index 0000000000..80264ec631 --- /dev/null +++ b/flog/include/compiler.h @@ -0,0 +1,77 @@ +#ifndef __COMPILER_H__ +#define __COMPILER_H__ + +/* + * Various definitions for success build, + * picked from various places, mostly from + * the linux kernel. + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)])) + +#define __stringify_1(x...) 
#x +#define __stringify(x...) __stringify_1(x) + +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) + +#define __section(S) __attribute__((__section__(#S))) + +#ifndef __always_inline +#define __always_inline inline __attribute__((always_inline)) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#ifndef always_inline +#define always_inline __always_inline +#endif + +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + +#define __aligned(x) __attribute__((aligned(x))) + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) & ((TYPE *)0)->MEMBER) +#endif + +#define barrier() asm volatile("" ::: "memory") + +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) + +#define min(x, y) \ + ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void)(&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; \ + }) + +#define max(x, y) \ + ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void)(&_max1 == &_max2); \ + _max1 > _max2 ? 
_max1 : _max2; \ + }) + +#define is_log2(v) (((v) & ((v)-1)) == 0) + +#endif /* __COMPILER_H__ */ diff --git a/flog/include/flog.h b/flog/include/flog.h new file mode 100644 index 0000000000..f00c20541f --- /dev/null +++ b/flog/include/flog.h @@ -0,0 +1,9 @@ +#ifndef __FLOG_H__ +#define __FLOG_H__ + +#include +#include + +#include "uapi/flog.h" + +#endif /* __FLOG_H__ */ diff --git a/flog/include/log.h b/flog/include/log.h new file mode 100644 index 0000000000..8aafe44b75 --- /dev/null +++ b/flog/include/log.h @@ -0,0 +1,17 @@ +#ifndef __LOG_H__ +#define __LOG_H__ + +#include + +#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) + +#if 1 +#define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#else +#define pr_debug(fmt, ...) +#endif + +#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_perror(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) + +#endif /* __LOG_H__ */ diff --git a/flog/include/types.h b/flog/include/types.h new file mode 100644 index 0000000000..07c992968b --- /dev/null +++ b/flog/include/types.h @@ -0,0 +1,16 @@ +#ifndef __FLOG_TYPES_H__ +#define __FLOG_TYPES_H__ + +#include +#include + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +#endif /* __FLOG_TYPES_H__ */ diff --git a/flog/include/uapi/flog.h b/flog/include/uapi/flog.h new file mode 100644 index 0000000000..5fb71b1053 --- /dev/null +++ b/flog/include/uapi/flog.h @@ -0,0 +1,139 @@ +#ifndef __UAPI_FLOG_H__ +#define __UAPI_FLOG_H__ + +#include +#include +#include + +/* + * We work with up to 32 arguments in macros here. + * If more provided -- behaviour is undefined. + */ + +/* + * By Laurent Deniau at https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s + */ +#define FLOG_PP_NARG_(...) 
FLOG_PP_ARG_N(__VA_ARGS__) +#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) + +#define FLOG_PP_ARG_N(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, \ + _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, N, ...) \ + N + +#define FLOG_PP_RSEQ_N() \ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, \ + 2, 1, 0 + +#define FLOG_GENMASK_0(N, x) 0 +#define FLOG_GENMASK_1(N, op, x, ...) (op(N, 0, x)) +#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_11(N, op, x, ...) ((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_15(N, op, x, ...) ((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_17(N, op, x, ...) 
((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_22(N, op, x, ...) ((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_32(N, op, x, ...) ((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) + +#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) +#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) +#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 + +#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) +#define FLOG_GENMASK(op, ...) FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) + +#define flog_genbit(ord, n, v, ...) 
\ + _Generic((v), \ + \ + /* Basic types */ \ + char: 0, \ + signed char: 0, \ + unsigned char: 0, \ + signed short int: 0, \ + unsigned short int: 0, \ + signed int: 0, \ + unsigned int: 0, \ + signed long: 0, \ + unsigned long: 0, \ + signed long long: 0, \ + unsigned long long: 0, \ + \ + /* Not used for a while */ \ + /* float: 12, */ \ + /* double: 13, */ \ + /* long double: 14, */ \ + \ + /* Basic pointers */ \ + char *: (1u << (ord - n - 1)), \ + signed char *: (1u << (ord - n - 1)), \ + unsigned char *: (1u << (ord - n - 1)), \ + signed short int *: 0, \ + unsigned short int *: 0, \ + signed int *: 0, \ + unsigned int *: 0, \ + signed long *: 0, \ + unsigned long *: 0, \ + signed long long *: 0, \ + unsigned long long *: 0, \ + void *: 0, \ + \ + /* Const basic pointers */ \ + const char *: (1u << (ord - n - 1)), \ + const signed char *: (1u << (ord - n - 1)), \ + const unsigned char *: (1u << (ord - n - 1)), \ + const signed short int *: 0, \ + const unsigned short int *: 0, \ + const signed int *: 0, \ + const unsigned int *: 0, \ + const signed long *: 0, \ + const unsigned long *: 0, \ + const signed long long *: 0, \ + const unsigned long long *: 0, \ + const void *: 0, \ + \ + /* Systypes and pointers */ \ + default: -1) + +typedef struct { + unsigned int magic; + unsigned int size; + unsigned int nargs; + unsigned int mask; + long fmt; + long args[0]; +} flog_msg_t; + +extern int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...); +void flog_decode_msg(int fdout, const char *format, ...); +extern int flog_decode_all(int fdin, int fdout); + +#define flog_encode(fdout, fmt, ...) 
\ + flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) + +int flog_map_buf(int fdout); +int flog_close(int fdout); + +#endif /* __UAPI_FLOG_H__ */ diff --git a/flog/include/util.h b/flog/include/util.h new file mode 100644 index 0000000000..7b1edb6885 --- /dev/null +++ b/flog/include/util.h @@ -0,0 +1,41 @@ +#ifndef __UTIL_H__ +#define __UTIL_H__ + +#include +#include + +#include "log.h" +#include "types.h" + +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + ___p; \ + }) + +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) \ + do { \ + if (p) \ + free(p); \ + } while (0) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -ENOMEM; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) + +#endif /* __UTIL_H__ */ diff --git a/flog/src/Makefile b/flog/src/Makefile new file mode 100644 index 0000000000..ee73ea7252 --- /dev/null +++ b/flog/src/Makefile @@ -0,0 +1,5 @@ +ccflags-y += -DCONFIG_X86_64 -iquote ./include $(OPTS) +ldflags-y += -r + +#obj-y += main.o +obj-y += flog.o diff --git a/flog/src/flog.c b/flog/src/flog.c new file mode 100644 index 0000000000..d7660f18d8 --- /dev/null +++ b/flog/src/flog.c @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include + +//#include + +#include "uapi/flog.h" +#include "util.h" + +#define MAGIC 0xABCDABCD + +#define BUF_SIZE (1 << 20) +static char _mbuf[BUF_SIZE]; +static char *mbuf = _mbuf; +static char *fbuf; +static uint64_t fsize; +static uint64_t mbuf_size = sizeof(_mbuf); + +/*int flog_decode_all(int fdin, int fdout) +{ + flog_msg_t *m = (void *)mbuf; + 
ffi_type *args[34] = { + [0] = &ffi_type_sint, + [1] = &ffi_type_pointer, + [2 ... 33] = &ffi_type_slong + }; + void *values[34]; + ffi_cif cif; + ffi_arg rc; + size_t i, ret; + char *fmt; + + values[0] = (void *)&fdout; + + while (1) { + ret = read(fdin, mbuf, sizeof(m)); + if (ret == 0) + break; + if (ret < 0) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + if (m->magic != MAGIC) { + fprintf(stderr, "The log file was not properly closed\n"); + break; + } + ret = m->size - sizeof(m); + if (m->size > mbuf_size) { + fprintf(stderr, "The buffer is too small"); + return -1; + } + if (read(fdin, mbuf + sizeof(m), ret) != ret) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + + fmt = mbuf + m->fmt; + values[1] = &fmt; + + for (i = 0; i < m->nargs; i++) { + values[i + 2] = (void *)&m->args[i]; + if (m->mask & (1u << i)) { + m->args[i] = (long)(mbuf + m->args[i]); + } + } + + if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, m->nargs + 2, + &ffi_type_sint, args) == FFI_OK) + ffi_call(&cif, FFI_FN(dprintf), &rc, values); + } + return 0; +}*/ + +static int flog_enqueue(flog_msg_t *m) +{ + if (write(1, m, m->size) != m->size) { + fprintf(stderr, "Unable to write a message\n"); + return -1; + } + return 0; +} + +/*extern char *rodata_start; +extern char *rodata_end; +*/ +/* Pre-allocate a buffer in a file and map it into memory. */ +int flog_map_buf(int fdout) +{ + uint64_t off = 0; + void *addr; + + /* + * Two buffers are mmapped into memory. A new one is mapped when a first + * one is completely filled. 
+ */ + if (fbuf && (mbuf - fbuf < BUF_SIZE)) + return 0; + + if (fbuf) { + if (munmap(fbuf, BUF_SIZE * 2)) { + fprintf(stderr, "Unable to unmap a buffer: %m"); + return -1; + } + off = mbuf - fbuf - BUF_SIZE; + fbuf = NULL; + } + + if (fsize == 0) + fsize += BUF_SIZE; + fsize += BUF_SIZE; + + if (ftruncate(fdout, fsize)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + + if (!fbuf) + addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, MAP_FILE | MAP_SHARED, fdout, + fsize - 2 * BUF_SIZE); + else + addr = mremap(fbuf + BUF_SIZE, BUF_SIZE, BUF_SIZE * 2, MREMAP_FIXED, fbuf); + if (addr == MAP_FAILED) { + fprintf(stderr, "Unable to map a buffer: %m"); + return -1; + } + + fbuf = addr; + mbuf = fbuf + off; + mbuf_size = 2 * BUF_SIZE; + + return 0; +} + +int flog_close(int fdout) +{ + if (mbuf == _mbuf) + return 0; + + munmap(fbuf, BUF_SIZE * 2); + + if (ftruncate(fdout, fsize - 2 * BUF_SIZE + mbuf - fbuf)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + return 0; +} + +int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...) +{ + flog_msg_t *m; + va_list argptr; + char *str_start, *p; + size_t i; + + if (mbuf != _mbuf && flog_map_buf(fdout)) + return -1; + + m = (void *)mbuf; + + m->nargs = nargs; + m->mask = mask; + + str_start = (void *)m->args + sizeof(m->args[0]) * nargs; + p = memccpy(str_start, format, 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + return -1; + } + m->fmt = str_start - mbuf; + str_start = p; + + va_start(argptr, format); + for (i = 0; i < nargs; i++) { + m->args[i] = (long)va_arg(argptr, long); + /* + * If we got a string, we should either + * reference it when in rodata, or make + * a copy (FIXME implement rodata refs). 
+ */ + if (mask & (1u << i)) { + p = memccpy(str_start, (void *)m->args[i], 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + va_end(argptr); + return -1; + } + m->args[i] = str_start - mbuf; + str_start = p; + } + } + va_end(argptr); + m->size = str_start - mbuf; + + /* + * A magic is required to know where we stop writing into a log file, + * if it was not properly closed. The file is mapped into memory, so a + * space in the file is allocated in advance and at the end it can have + * some unused tail. + */ + m->magic = MAGIC; + + m->size = roundup(m->size, 8); + if (mbuf == _mbuf) { + if (flog_enqueue(m)) + return -1; + } else { + mbuf += m->size; + mbuf_size -= m->size; + } + return 0; +} diff --git a/flog/src/main.c b/flog/src/main.c new file mode 100644 index 0000000000..e027917c68 --- /dev/null +++ b/flog/src/main.c @@ -0,0 +1,158 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "flog.h" + +extern char _rodata_start, _rodata_end; +char *rodata_start = &_rodata_start; +char *rodata_end = &_rodata_end; + +enum { + MODE_BINARY, + MODE_FPRINTF, + MODE_SPRINTF, + MODE_DPRINTF, +}; + +int main(int argc, char *argv[]) +{ + static const char str1[] = "String1 String1"; + static const char str2[] = "string2 string2 string2"; + int fdout = STDOUT_FILENO; + bool use_decoder = false; + int mode = MODE_BINARY; + size_t niter = 100; + int opt, idx; + size_t i; + + static const char short_opts[] = "m:o:di:h"; + static struct option long_opts[] = { + { "mode", required_argument, 0, 'm' }, { "output", required_argument, 0, 'o' }, + { "decode", no_argument, 0, 'd' }, { "iter", required_argument, 0, 'i' }, + { "help", no_argument, 0, 'h' }, {}, + }; + + while (1) { + idx = -1; + opt = getopt_long(argc, argv, short_opts, long_opts, &idx); + if (opt == -1) + break; + + switch (opt) { + case 'm': + if (strcmp(optarg, "binary") == 0) { + mode = MODE_BINARY; + } else if 
(strcmp(optarg, "fprintf") == 0) { + mode = MODE_FPRINTF; + } else if (strcmp(optarg, "sprintf") == 0) { + mode = MODE_SPRINTF; + } else if (strcmp(optarg, "dprintf") == 0) { + mode = MODE_DPRINTF; + } else + goto usage; + break; + case 'o': + if (strcmp(optarg, "stdout") == 0) { + fdout = fileno(stdout); + } else if (strcmp(optarg, "stderr") == 0) { + fdout = fileno(stderr); + } else { + fdout = open(optarg, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fdout < 0) { + fprintf(stderr, "Can't open %s: %s\n", optarg, strerror(errno)); + exit(1); + } + } + break; + case 'i': + niter = atoi(optarg); + break; + case 'd': + use_decoder = true; + break; + case 'h': + default: + goto usage; + } + } + + switch (mode) { + case MODE_BINARY: + if (use_decoder) + return flog_decode_all(STDIN_FILENO, fdout); + + if (fdout != STDOUT_FILENO && flog_map_buf(fdout)) + return 1; + for (i = 0; i < niter; i++) + if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, + (short)2, (unsigned long)2)) + return 1; + if (flog_close(fdout)) + return 1; + break; + case MODE_DPRINTF: { + for (i = 0; i < niter; i++) { + dprintf(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + case MODE_FPRINTF: { + FILE *f = fdopen(fdout, "w"); + + for (i = 0; i < niter; i++) { + fprintf(f, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + fflush(f); + } + fclose(f); + break; + } + case MODE_SPRINTF: { + static char buf[4096]; + + for (i = 0; i < niter; i++) { + sprintf(buf, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + default: + return 1; + } + + return 0; +usage: + fprintf(stderr, "flog [--mode binary|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" + "\n" + + "Examples:\n" + "\n" + + " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" + 
"\n" + " flog -m dprintf -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode without processing (queue messages only)\n" + "\n" + " flog -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after\n" + "\n" + " flog -i 100000 -d\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after, writing results into 'out' file\n" + "\n" + " flog -i 100000 -d -o out\n" + "\n"); + return 1; +} diff --git a/flog/tests/test00 b/flog/tests/test00 new file mode 100755 index 0000000000..a7937e4a18 --- /dev/null +++ b/flog/tests/test00 @@ -0,0 +1,22 @@ +#!/bin/sh + +set -e -x + +echo Map a log file into memory +time ./flog run -i 1000000 -o /tmp/flog.raw.map +echo Write into a log file +time ./flog run -i 1000000 > /tmp/flog.raw +echo Use fprintf +time ./flog run -m fprintf -i 1000000 -o /tmp/flog.fprintf.txt +echo Use dprintf +time ./flog run -m dprintf -i 1000000 -o /tmp/flog.dprintf.txt +echo Use sprintf +time ./flog run -m sprintf -i 1000000 + +time ./flog run -d < /tmp/flog.raw > /tmp/flog.raw.txt +cmp /tmp/flog.raw.txt /tmp/flog.fprintf.txt + +time ./flog run -d < /tmp/flog.raw.map > /tmp/flog.raw.map.txt +cmp /tmp/flog.raw.map.txt /tmp/flog.fprintf.txt + +cmp /tmp/flog.dprintf.txt /tmp/flog.fprintf.txt diff --git a/images/Makefile b/images/Makefile index 004e22ec3f..855d894da6 100644 --- a/images/Makefile +++ b/images/Makefile @@ -2,6 +2,7 @@ proto-obj-y += stats.o proto-obj-y += core.o proto-obj-y += core-x86.o proto-obj-y += core-mips.o +proto-obj-y += core-loongarch64.o proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o @@ -72,6 +73,7 @@ proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o proto-obj-y += rseq.o +proto-obj-y += pidfd.o CFLAGS += -iquote $(obj)/ diff --git a/images/core-loongarch64.proto b/images/core-loongarch64.proto new file mode 100755 index 0000000000..8258f006ea --- /dev/null +++ b/images/core-loongarch64.proto @@ -0,0 +1,23 @@ +// 
SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; + +message user_loongarch64_gpregs_entry { + repeated uint64 regs = 1; + required uint64 pc = 2; +} + +message user_loongarch64_fpregs_entry { + repeated uint64 regs = 1; + required uint64 fcc = 2; + required uint32 fcsr = 3; +} + +message thread_info_loongarch64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_loongarch64_gpregs_entry gpregs = 3[(criu).hex = true]; + required user_loongarch64_fpregs_entry fpregs = 4[(criu).hex = true]; +} diff --git a/images/core-x86.proto b/images/core-x86.proto index 815cf21ff8..762418d73b 100644 --- a/images/core-x86.proto +++ b/images/core-x86.proto @@ -41,6 +41,11 @@ message user_x86_regs_entry { optional user_x86_regs_mode mode = 28 [default = NATIVE]; } +message user_x86_cet_entry { + required uint64 cet = 1[(criu).hex = true]; + required uint64 ssp = 2[(criu).hex = true]; +} + message user_x86_xsave_entry { /* standard xsave features */ required uint64 xstate_bv = 1; @@ -60,6 +65,9 @@ message user_x86_xsave_entry { /* Protected keys */ repeated uint32 pkru = 8; + /* CET */ + optional user_x86_cet_entry cet = 9; + /* * Processor trace (PT) and hardware duty cycling (HDC) * are supervisor state components and only managed by diff --git a/images/core.proto b/images/core.proto index eddd1dc555..5b07b5c448 100644 --- a/images/core.proto +++ b/images/core.proto @@ -8,6 +8,7 @@ import "core-aarch64.proto"; import "core-ppc64.proto"; import "core-s390.proto"; import "core-mips.proto"; +import "core-loongarch64.proto"; import "rlimit.proto"; import "timer.proto"; @@ -63,6 +64,8 @@ message task_core_entry { optional uint64 blk_sigset_extended = 20[(criu).hex = true]; optional uint32 stop_signo = 21; + + optional uint32 membarrier_registration_mask = 22 [(criu).hex = true]; } message task_kobj_ids_entry { @@ -122,6 +125,7 @@ message core_entry { PPC64 = 4; S390 = 5; MIPS = 6; + LOONGARCH64 = 7; } required 
march mtype = 1; @@ -131,6 +135,7 @@ message core_entry { optional thread_info_ppc64 ti_ppc64 = 9; optional thread_info_s390 ti_s390 = 10; optional thread_info_mips ti_mips = 11; + optional thread_info_loongarch64 ti_loongarch64 = 12; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; diff --git a/images/creds.proto b/images/creds.proto index 6228f7fcbb..220ed38587 100644 --- a/images/creds.proto +++ b/images/creds.proto @@ -24,4 +24,5 @@ message creds_entry { optional string lsm_profile = 15; optional string lsm_sockcreate = 16; optional bytes apparmor_data = 17; + optional uint32 no_new_privs = 18; } diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 88f1c11860..32ec13cf48 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -17,6 +17,7 @@ import "ext-file.proto"; import "sk-unix.proto"; import "fifo.proto"; import "pipe.proto"; +import "pidfd.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; @@ -42,6 +43,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + PIDFD = 20; /* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -78,4 +80,5 @@ message file_entry { optional tty_file_entry tty = 19; optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; + optional pidfd_entry pidfd = 22; } diff --git a/images/inventory.proto b/images/inventory.proto index a735bad1d0..7f655031bc 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -10,6 +10,13 @@ enum lsmtype { APPARMOR = 2; } +// It is not possible to distinguish between an empty repeated field +// and an unset repeated field. To solve this problem and provide backwards +// compatibility, we use the 'plugins_entry' message. 
+message plugins_entry { + repeated string plugins = 12; +}; + message inventory_entry { required uint32 img_version = 1; optional bool fdinfo_per_id = 2; @@ -21,4 +28,5 @@ message inventory_entry { optional uint32 pre_dump_mode = 9; optional bool tcp_close = 10; optional uint32 network_lock_method = 11; + optional plugins_entry plugins_entry = 12; } diff --git a/images/memfd.proto b/images/memfd.proto index 0e625416a7..bb0be4a6fc 100644 --- a/images/memfd.proto +++ b/images/memfd.proto @@ -22,4 +22,5 @@ message memfd_inode_entry { required uint32 seals = 6 [(criu).flags = "seals.flags"]; required uint64 inode_id = 7; optional uint32 hugetlb_flag = 8; + optional uint32 mode = 9; }; diff --git a/images/pidfd.proto b/images/pidfd.proto new file mode 100644 index 0000000000..a9da3e4543 --- /dev/null +++ b/images/pidfd.proto @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "fown.proto"; + +message pidfd_entry { + required uint32 id = 1; + required uint32 ino = 2; + required uint32 flags = 3; + required int32 nspid = 4; + required fown_entry fown = 5; +} diff --git a/images/rpc.proto b/images/rpc.proto index afd2c7b43f..1a4722a9ce 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -52,6 +52,7 @@ enum criu_cg_mode { enum criu_network_lock_method { IPTABLES = 1; NFTABLES = 2; + SKIP = 3; }; enum criu_pre_dump_mode { @@ -60,7 +61,8 @@ enum criu_pre_dump_mode { }; message criu_opts { - required int32 images_dir_fd = 1; + required int32 images_dir_fd = 1 [default = -1]; + optional string images_dir = 68; /* used only if images_dir_fd == -1 */ optional int32 pid = 2; /* if not set on dump, will dump requesting process */ optional bool leave_running = 3; @@ -140,6 +142,9 @@ message criu_opts { optional bool mntns_compat_mode = 65; optional bool skip_file_rwx_check = 66; optional bool unprivileged = 67; + optional bool leave_stopped = 69; + optional bool display_stats = 70; + optional bool log_to_stderr = 71; /* optional bool check_mounts = 
128; */ } diff --git a/images/sk-inet.proto b/images/sk-inet.proto index ee1f0ae410..2c709e0181 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -5,6 +5,7 @@ syntax = "proto2"; import "opts.proto"; import "fown.proto"; import "sk-opts.proto"; +import "tcp-stream.proto"; message ip_opts_raw_entry { optional bool hdrincl = 1; @@ -19,6 +20,8 @@ message ip_opts_entry { optional ip_opts_raw_entry raw = 4; optional bool pktinfo = 5; + optional uint32 tos = 6; + optional uint32 ttl = 7; } message inet_sk_entry { @@ -54,4 +57,5 @@ message inet_sk_entry { optional string ifname = 17; optional uint32 ns_id = 18; optional sk_shutdown shutdown = 19; + optional tcp_opts_entry tcp_opts = 20; } diff --git a/images/sk-opts.proto b/images/sk-opts.proto index 1d24d47cc7..2f9d4e5c3c 100644 --- a/images/sk-opts.proto +++ b/images/sk-opts.proto @@ -26,9 +26,12 @@ message sk_opts_entry { optional bool so_reuseport = 17; optional bool so_broadcast = 18; optional bool so_keepalive = 19; + + /* These three are deprecated, use tcp_opts_entry instead */ optional uint32 tcp_keepcnt = 20; optional uint32 tcp_keepidle = 21; optional uint32 tcp_keepintvl = 22; + optional uint32 so_oobinline = 23; optional uint32 so_linger = 24; diff --git a/images/tcp-stream.proto b/images/tcp-stream.proto index c2244ba3bf..3d834159fb 100644 --- a/images/tcp-stream.proto +++ b/images/tcp-stream.proto @@ -4,6 +4,14 @@ syntax = "proto2"; import "opts.proto"; +message tcp_opts_entry { + optional bool cork = 1; + optional bool nodelay = 2; + optional uint32 keepcnt = 3; + optional uint32 keepidle = 4; + optional uint32 keepintvl = 5; +} + message tcp_stream_entry { required uint32 inq_len = 1; required uint32 inq_seq = 2; @@ -16,6 +24,7 @@ message tcp_stream_entry { optional uint32 rcv_wscale = 8; optional uint32 timestamp = 9; + /* These two are deprecated, use tcp_opts_entry instead */ optional bool cork = 10; optional bool nodelay = 11; diff --git a/include/common/arch/aarch64/asm/page.h 
b/include/common/arch/aarch64/asm/page.h index 90670d1265..4555debbdc 100644 --- a/include/common/arch/aarch64/asm/page.h +++ b/include/common/arch/aarch64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/loongarch64/asm/atomic.h b/include/common/arch/loongarch64/asm/atomic.h new file mode 100644 index 0000000000..9017254397 --- /dev/null +++ b/include/common/arch/loongarch64/asm/atomic.h @@ -0,0 +1,62 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +#include +#include "common/compiler.h" + +typedef struct { + int counter; +} atomic_t; + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +static inline int __atomic_add(int i, atomic_t *v) +{ + int result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(v->counter), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)(i), v) /* (i) is parenthesized so the cast and negation cover the whole argument expression */ +#define atomic_sub_return(i, v) atomic_add_return(-(int)(i), v) +#define atomic_inc(v) atomic_add(1, v) +#define atomic_inc_return(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " 
sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(ptr->counter) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/loongarch64/asm/bitops.h b/include/common/arch/loongarch64/asm/bitops.h new file mode 100644 index 0000000000..170e4f7369 --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitops.h @@ -0,0 +1,24 @@ +#ifndef _LINUX_BITOPS_H +#define _LINUX_BITOPS_H +#include "common/asm-generic/bitops.h" + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ + +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) /* index of the unsigned long that holds bit nr */ +static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long res, mask; + mask = BIT_MASK(nr); + asm volatile("amor_db.d %0, %2, %1" : "=&r"(res), "+ZB"(addr[BIT_WORD(nr)]) : "r"(mask) : "memory"); + return (res & mask) != 0; +} + +#endif diff --git a/include/common/arch/loongarch64/asm/bitsperlong.h b/include/common/arch/loongarch64/asm/bitsperlong.h new file mode 100644 index 0000000000..13d06a384e --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG _LOONGARCH_SZLONG + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/loongarch64/asm/linkage.h b/include/common/arch/loongarch64/asm/linkage.h new file mode 100644 index 0000000000..448acc29fc --- /dev/null +++ b/include/common/arch/loongarch64/asm/linkage.h @@ -0,0 +1,19 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#define __ALIGN .align 2 +#define __ALIGN_STR ".align 2" + +#define GLOBAL(name) \ + .globl name; \ +name: + +#define ENTRY(name) \ + .globl name; \ + __ALIGN; \ + .type name, 
@function; \ +name: + +#define END(sym) .size sym, .- sym + +#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/loongarch64/asm/page.h b/include/common/arch/loongarch64/asm/page.h new file mode 100644 index 0000000000..4fcdb64dc1 --- /dev/null +++ b/include/common/arch/loongarch64/asm/page.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +static unsigned __page_size; +static unsigned __page_shift; + +static inline unsigned long page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SIZE page_size() +#define PAGE_SHIFT page_shift() +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) +#else /* CR_NOGLIBC */ + +extern unsigned long page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_PAGE_H__ */ diff --git a/include/common/arch/mips/asm/page.h b/include/common/arch/mips/asm/page.h index 25bdbc1412..4fcdb64dc1 100644 --- a/include/common/arch/mips/asm/page.h +++ b/include/common/arch/mips/asm/page.h @@ -10,7 +10,7 @@ static unsigned __page_size; static unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -31,7 +31,7 @@ static inline unsigned page_shift(void) #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/ppc64/asm/page.h b/include/common/arch/ppc64/asm/page.h index a1ff6718ad..2b0c0e5042 100644 --- a/include/common/arch/ppc64/asm/page.h +++ 
b/include/common/arch/ppc64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/compiler.h b/include/common/compiler.h index bd3de01df1..3e66709f92 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -30,6 +30,17 @@ #define __always_unused __attribute__((unused)) #define __must_check __attribute__((__warn_unused_result__)) +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +/* Not supported by clang */ +#if __has_attribute(__externally_visible__) +#define __visible __attribute__((__externally_visible__)) +#else +#define __visible +#endif + #define __section(S) __attribute__((__section__(#S))) #ifndef __always_inline @@ -47,7 +58,9 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __aligned #define __aligned(x) __attribute__((aligned(x))) +#endif /* * Macro to define stack alignment. 
@@ -76,6 +89,7 @@ #define round_down(x, y) ((x) & ~__round_mask(x, y)) #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) #define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) +#define ALIGN_DOWN(x, a) ALIGN((x) - ((a) - 1), (a)) #define min(x, y) \ ({ \ diff --git a/include/common/scm.h b/include/common/scm.h index bcb198882b..5b6f78a8bd 100644 --- a/include/common/scm.h +++ b/include/common/scm.h @@ -11,7 +11,7 @@ * Because of kernel doing kmalloc for user data passed * in SCM messages, and there is kernel's SCM_MAX_FD as a limit * for descriptors passed at once we're trying to reduce - * the pressue on kernel memory manager and use predefined + * the pressure on kernel memory manager and use predefined * known to work well size of the message buffer. */ #define CR_SCM_MSG_SIZE (1024) diff --git a/lib/.gitignore b/lib/.gitignore new file mode 100644 index 0000000000..a10181b800 --- /dev/null +++ b/lib/.gitignore @@ -0,0 +1 @@ +pycriu.egg-info/ diff --git a/lib/Makefile b/lib/Makefile index ff540fb75d..4b8a6cbb83 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,23 +25,23 @@ lib-a: lib/c/$(CRIU_A) # # Python bindings. 
-lib/py/Makefile: ; -lib/py/%: .FORCE +lib/pycriu/Makefile: ; +lib/pycriu/%: .FORCE $(call msg-gen, $@) - $(Q) $(MAKE) $(build)=lib/py $@ + $(Q) $(MAKE) $(build)=lib/pycriu $@ lib-py: - $(Q) $(MAKE) $(build)=lib/py all + $(Q) $(MAKE) $(build)=lib/pycriu all .PHONY: lib-py clean-lib: $(Q) $(MAKE) $(build)=lib/c clean - $(Q) $(MAKE) $(build)=lib/py clean + $(Q) $(MAKE) $(build)=lib/pycriu clean .PHONY: clean-lib clean: clean-lib cleanup-y += lib/c/$(CRIU_SO) lib/c/$(CRIU_A) lib/c/criu.pc mrproper: clean -install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in +install: lib-c lib-a lib-py lib/c/criu.pc.in $(E) " INSTALL " lib $(Q) mkdir -p $(DESTDIR)$(LIBDIR) $(Q) install -m 755 lib/c/$(CRIU_SO) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) @@ -54,9 +54,11 @@ install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig -ifeq ($(PYTHON),python3) - $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install --upgrade --force-reinstall --prefix=$(DESTDIR)$(PREFIX) ./crit +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " INSTALL " pycriu + $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./lib +else + $(E) " SKIP INSTALL pycriu" endif .PHONY: install @@ -69,8 +71,10 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) -ifeq ($(PYTHON),python3) - $(E) " UNINSTALL" crit - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " UNINSTALL" pycriu + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu +else + $(E) " SKIP UNINSTALL pycriu" endif .PHONY: uninstall 
diff --git a/lib/c/criu.c b/lib/c/criu.c index fc8159999c..7f766db857 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -1868,7 +1868,7 @@ void criu_set_pidfd_store_sk(int sk) int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method) { opts->rpc->has_network_lock = true; - if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES) { + if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES || method == CRIU_NETWORK_LOCK_SKIP) { opts->rpc->network_lock = (CriuNetworkLockMethod)method; return 0; } @@ -2030,3 +2030,14 @@ int criu_feature_check(struct criu_feature_check *features, size_t size) { return criu_local_feature_check(global_opts, features, size); } + +void criu_local_set_empty_ns(criu_opts *opts, int namespaces) +{ + opts->rpc->has_empty_ns = true; + opts->rpc->empty_ns = namespaces; +} + +void criu_set_empty_ns(int namespaces) +{ + criu_local_set_empty_ns(global_opts, namespaces); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index 28a083d88d..c1c6078698 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -50,6 +50,7 @@ enum criu_cg_mode { enum criu_network_lock_method { CRIU_NETWORK_LOCK_IPTABLES = 1, CRIU_NETWORK_LOCK_NFTABLES = 2, + CRIU_NETWORK_LOCK_SKIP = 3, }; enum criu_pre_dump_mode { CRIU_PRE_DUMP_SPLICE = 1, CRIU_PRE_DUMP_READ = 2 }; @@ -322,6 +323,9 @@ struct criu_feature_check { int criu_feature_check(struct criu_feature_check *features, size_t size); int criu_local_feature_check(criu_opts *opts, struct criu_feature_check *features, size_t size); +void criu_local_set_empty_ns(criu_opts *opts, int namespaces); +void criu_set_empty_ns(int namespaces); + #ifdef __GNUG__ } #endif diff --git a/lib/py/.gitignore b/lib/py/.gitignore deleted file mode 100644 index d3090fca32..0000000000 --- a/lib/py/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*_pb2.py -*.pyc diff --git a/lib/pycriu/.gitignore b/lib/pycriu/.gitignore new file mode 100644 index 0000000000..111642787a --- /dev/null +++ 
b/lib/pycriu/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +*_pb2.py +*.pyc +version.py diff --git a/lib/py/Makefile b/lib/pycriu/Makefile similarity index 66% rename from lib/py/Makefile rename to lib/pycriu/Makefile index 691b6bdd33..5ce9bc8f7e 100644 --- a/lib/py/Makefile +++ b/lib/pycriu/Makefile @@ -1,4 +1,4 @@ -all-y += libpy-images rpc_pb2.py +all-y += libpy-images rpc_pb2.py version.py $(obj)/images/Makefile: ; $(obj)/images/%: .FORCE @@ -11,7 +11,10 @@ libpy-images: rpc_pb2.py: $(Q) protoc -I=images/ --python_out=$(obj) images/$(@:_pb2.py=.proto) -cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc) +version.py: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $(obj)/$@ + +cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc version.py) clean-lib-py: $(Q) $(MAKE) $(build)=$(obj)/images clean diff --git a/lib/py/__init__.py b/lib/pycriu/__init__.py similarity index 68% rename from lib/py/__init__.py rename to lib/pycriu/__init__.py index 96b3e9526c..2abcf029de 100644 --- a/lib/py/__init__.py +++ b/lib/pycriu/__init__.py @@ -1,3 +1,4 @@ from . import rpc_pb2 as rpc from . 
import images from .criu import * +from .version import __version__ \ No newline at end of file diff --git a/lib/py/criu.py b/lib/pycriu/criu.py similarity index 100% rename from lib/py/criu.py rename to lib/pycriu/criu.py diff --git a/lib/py/images/.gitignore b/lib/pycriu/images/.gitignore similarity index 100% rename from lib/py/images/.gitignore rename to lib/pycriu/images/.gitignore diff --git a/lib/py/images/Makefile b/lib/pycriu/images/Makefile similarity index 100% rename from lib/py/images/Makefile rename to lib/pycriu/images/Makefile diff --git a/lib/py/images/__init__.py b/lib/pycriu/images/__init__.py similarity index 100% rename from lib/py/images/__init__.py rename to lib/pycriu/images/__init__.py diff --git a/lib/py/images/images.py b/lib/pycriu/images/images.py similarity index 98% rename from lib/py/images/images.py rename to lib/pycriu/images/images.py index a1d76e7cf2..9db506e1ee 100644 --- a/lib/py/images/images.py +++ b/lib/pycriu/images/images.py @@ -42,7 +42,6 @@ import struct import os import array -import sys from . import magic from . 
import pb @@ -71,18 +70,12 @@ def __init__(self, magic): def decode_base64_data(data): """A helper function to decode base64 data.""" - if (sys.version_info > (3, 0)): - return base64.decodebytes(str.encode(data)) - else: - return base64.decodebytes(data) + return base64.decodebytes(str.encode(data)) def write_base64_data(f, data): """A helper function to write base64 encoded data to a file.""" - if (sys.version_info > (3, 0)): - f.write(base64.decodebytes(str.encode(data))) - else: - f.write(base64.decodebytes(data)) + f.write(base64.decodebytes(str.encode(data))) # Generic class to handle loading/dumping criu images entries from/to bin diff --git a/lib/py/images/pb2dict.py b/lib/pycriu/images/pb2dict.py similarity index 96% rename from lib/py/images/pb2dict.py rename to lib/pycriu/images/pb2dict.py index 9d581c3750..e3dd95ac0a 100644 --- a/lib/py/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -3,7 +3,6 @@ import os import quopri import socket -import sys from ipaddress import IPv4Address, IPv6Address, ip_address from google.protobuf.descriptor import FieldDescriptor as FD @@ -103,6 +102,8 @@ def _custom_conv(field): ('VMA_AREA_SOCKET', 1 << 11), ('VMA_AREA_VVAR', 1 << 12), ('VMA_AREA_AIORING', 1 << 13), + ('VMA_AREA_MEMFD', 1 << 14), + ('VMA_AREA_SHSTK', 1 << 15), ('VMA_UNSUPP', 1 << 31), ] @@ -247,17 +248,11 @@ def encode_dev(field, value): def encode_base64(value): - if (sys.version_info > (3, 0)): - return base64.encodebytes(value).decode() - else: - return base64.encodebytes(value) + return base64.encodebytes(value).decode() def decode_base64(value): - if (sys.version_info > (3, 0)): - return base64.decodebytes(str.encode(value)) - else: - return base64.decodebytes(value) + return base64.decodebytes(str.encode(value)) def encode_unix(value): @@ -309,7 +304,7 @@ def _pb2dict_cast(field, value, pretty=False, is_hex=False): return field.enum_type.values_by_number.get(value, None).name elif field.type in _basic_cast: cast = _basic_cast[field.type] - if 
pretty and (cast == int): + if pretty and cast is int: if is_hex: # Fields that have (criu).hex = true option set # should be stored in hex string format. @@ -364,21 +359,24 @@ def pb2dict(pb, pretty=False, is_hex=False): else: d_val = _pb2dict_cast(field, value, pretty, is_hex) - d[field.name] = d_val.decode() if type(d_val) == bytes else d_val + try: + d[field.name] = d_val.decode() + except (UnicodeDecodeError, AttributeError): + d[field.name] = d_val return d def _dict2pb_cast(field, value): # Not considering TYPE_MESSAGE here, as repeated # and non-repeated messages need special treatment - # in this case, and are hadled separately. + # in this case, and are handled separately. if field.type == FD.TYPE_BYTES: return get_bytes_dec(field)(value) elif field.type == FD.TYPE_ENUM: return field.enum_type.values_by_name.get(value, None).number elif field.type in _basic_cast: cast = _basic_cast[field.type] - if (cast == int) and is_string(value): + if cast is int and is_string(value): if _marked_as_dev(field): return encode_dev(field, value) diff --git a/lib/pyproject.toml b/lib/pyproject.toml new file mode 100644 index 0000000000..8eb4b7084d --- /dev/null +++ b/lib/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools", "protobuf<4.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pycriu" +description = "Python bindings for CRIU" +authors = [ + {name = "CRIU team", email = "criu@openvz.org"}, +] +license = {text = "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[tool.setuptools] +packages = ["pycriu", "pycriu.images"] + +[tool.setuptools.dynamic] +version = {attr = "pycriu.__version__"} diff --git a/lib/setup.cfg b/lib/setup.cfg new file mode 100644 index 0000000000..23ee48dd5b --- /dev/null +++ b/lib/setup.cfg @@ -0,0 +1,16 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older versions of setuptools, we 
need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = pycriu +description = Python bindings for CRIU +author = CRIU team +author_email = criu@openvz.org +license = GPLv2 +version = attr: pycriu.__version__ + +[options] +packages = find: +python_requires = >=3.6 diff --git a/coredump/coredump-python3 b/lib/setup.py old mode 100755 new mode 100644 similarity index 55% rename from coredump/coredump-python3 rename to lib/setup.py index 3032dbadf1..618ac1de48 --- a/coredump/coredump-python3 +++ b/lib/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 +import setuptools -import coredump if __name__ == '__main__': - coredump.main() + setuptools.setup() diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 64a923d388..a20d1d1639 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -15,8 +15,7 @@ DEPS_NOK := ; __nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk -CC := gcc -PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" +PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu ifeq ($(CONFIG_AMDGPU),y) @@ -28,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc-c --proto_path=. --c_out=. 
criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_topology.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: @@ -54,7 +53,7 @@ install: ifeq ($(CONFIG_AMDGPU),y) $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) + $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) endif .PHONY: install diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 6809ec8b9a..1078eafe6f 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -263,7 +263,7 @@ ROCm | Radeon Open Compute Platform Thunk | User-mode API interface to interact with amdgpu.ko KFD | AMD Kernel Fusion Driver Mesa | Open source OpenGL implementation -GTT | Graphis Translation Table, also used to denote kernel-managed system memory for GPU access +GTT | Graphics Translation Table, also used to denote kernel-managed system memory for GPU access VRAM | Video RAM BO | Buffer Object HMM | Heterogeneous Memory Management diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 0a55e34a2b..96c0861628 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -30,55 +30,14 @@ #include "files.h" #include "common/list.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" #include "img-streamer.h" #include "image.h" #include "cr_options.h" -#define AMDGPU_KFD_DEVICE "/dev/kfd" -#define PROCPIDMEM "/proc/%d/mem" -#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" -#define HSAKMT_SHM "/hsakmt_shared_mem" -#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" -#define HSAKMT_SEM "hsakmt_semaphore" - -#define KFD_IOCTL_MAJOR_VERSION 1 -#define MIN_KFD_IOCTL_MINOR_VERSION 8 - 
-#define IMG_KFD_FILE "amdgpu-kfd-%d.img" -#define IMG_RENDERD_FILE "amdgpu-renderD-%d.img" -#define IMG_PAGES_FILE "amdgpu-pages-%d-%04x.img" - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif - -#ifdef LOG_PREFIX -#undef LOG_PREFIX -#endif -#define LOG_PREFIX "amdgpu_plugin: " - -#ifdef DEBUG -#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) -#else -#define plugin_log_msg(fmt, ...) \ - { \ - } -#endif - -#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) - -#define SDMA_OPCODE_COPY 1 -#define SDMA_COPY_SUB_OPCODE_LINEAR 0 -#define SDMA_NOP 0 -#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21) - -enum sdma_op_type { - SDMA_OP_VRAM_READ, - SDMA_OP_VRAM_WRITE, -}; - struct vma_metadata { struct list_head list; uint64_t old_pgoff; @@ -89,139 +48,23 @@ struct vma_metadata { }; /************************************ Global Variables ********************************************/ -struct tp_system src_topology; -struct tp_system dest_topology; - -struct device_maps checkpoint_maps; -struct device_maps restore_maps; - -extern int fd_next; - -static LIST_HEAD(update_vma_info_list); - -extern bool kfd_fw_version_check; -extern bool kfd_sdma_fw_version_check; -extern bool kfd_caches_count_check; -extern bool kfd_num_gws_check; -extern bool kfd_vram_size_check; -extern bool kfd_numa_check; -extern bool kfd_capability_check; - -/**************************************************************************************************/ - -int write_fp(FILE *fp, const void *buf, const size_t buf_len) -{ - size_t len_write; - - len_write = fwrite(buf, 1, buf_len, fp); - if (len_write != buf_len) { - pr_perror("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); - return -EIO; - } - return 0; -} - -int read_fp(FILE *fp, void *buf, const size_t buf_len) -{ - size_t len_read; - - len_read = fread(buf, 1, buf_len, fp); - if (len_read != buf_len) { - pr_perror("Unable to read file (read:%ld buf_len:%ld)", 
len_read, buf_len); - return -EIO; - } - return 0; -} - -/** - * @brief Open an image file - * - * We store the size of the actual contents in the first 8-bytes of the file. This allows us to - * determine the file size when using criu_image_streamer when fseek and fstat are not available. - * The FILE * returned is already at the location of the first actual contents. - * - * @param path The file path - * @param write False for read, true for write - * @param size Size of actual contents - * @return FILE *if successful, NULL if failed - */ -FILE *open_img_file(char *path, bool write, size_t *size) -{ - FILE *fp = NULL; - int fd, ret; - - if (opts.stream) - fd = img_streamer_open(path, write ? O_DUMP : O_RSTR); - else - fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); - - if (fd < 0) { - pr_perror("%s: Failed to open for %s", path, write ? "write" : "read"); - return NULL; - } - - fp = fdopen(fd, write ? "w" : "r"); - if (!fp) { - pr_perror("%s: Failed get pointer for %s", path, write ? "write" : "read"); - return NULL; - } - - if (write) - ret = write_fp(fp, size, sizeof(*size)); - else - ret = read_fp(fp, size, sizeof(*size)); - - if (ret) { - pr_perror("%s:Failed to access file size", path); - fclose(fp); - return NULL; - } - - pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size); - return fp; -} /** - * @brief Write an image file - * - * We store the size of the actual contents in the first 8-bytes of the file. This allows us to - * determine the file size when using criu_image_streamer when fseek and fstat are not available. - * - * @param path The file path - * @param buf pointer to data to be written - * @param buf_len size of buf - * @return 0 if successful. -errno on failure + * FD of KFD device used to checkpoint. 
On a multi-process + * tree the order of checkpointing goes from parent to child + * and so on - so saving the FD will not be overwritten */ -int write_img_file(char *path, const void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; - size_t len = buf_len; +static int kfd_checkpoint_fd; - fp = open_img_file(path, true, &len); - if (!fp) - return -errno; +static LIST_HEAD(update_vma_info_list); - ret = write_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} +size_t kfd_max_buffer_size; -int read_file(const char *file_path, void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; +bool plugin_added_to_inventory = false; - fp = fopen(file_path, "r"); - if (!fp) { - pr_perror("Cannot fopen %s", file_path); - return -errno; - } +bool plugin_disabled = false; - ret = read_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} +/**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ int kmtIoctl(int fd, unsigned long request, void *arg) @@ -260,21 +103,21 @@ static void free_e(CriuKfd *e) static int allocate_device_entries(CriuKfd *e, int num_of_devices) { - e->device_entries = xmalloc(sizeof(DeviceEntry *) * num_of_devices); + e->device_entries = xmalloc(sizeof(KfdDeviceEntry *) * num_of_devices); if (!e->device_entries) { pr_err("Failed to allocate device_entries\n"); return -ENOMEM; } for (int i = 0; i < num_of_devices; i++) { - DeviceEntry *entry = xzalloc(sizeof(*entry)); + KfdDeviceEntry *entry = xzalloc(sizeof(*entry)); if (!entry) { pr_err("Failed to allocate entry\n"); return -ENOMEM; } - device_entry__init(entry); + kfd_device_entry__init(entry); e->device_entries[i] = entry; e->n_device_entries++; @@ -284,21 +127,21 @@ static int allocate_device_entries(CriuKfd *e, int num_of_devices) static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucket *bo_bucket_ptr) { - e->bo_entries = 
xmalloc(sizeof(BoEntry *) * num_bos); + e->bo_entries = xmalloc(sizeof(KfdBoEntry *) * num_bos); if (!e->bo_entries) { pr_err("Failed to allocate bo_info\n"); return -ENOMEM; } for (int i = 0; i < num_bos; i++) { - BoEntry *entry = xzalloc(sizeof(*entry)); + KfdBoEntry *entry = xzalloc(sizeof(*entry)); if (!entry) { pr_err("Failed to allocate botest\n"); return -ENOMEM; } - bo_entry__init(entry); + kfd_bo_entry__init(entry); e->bo_entries[i] = entry; e->n_bo_entries++; @@ -306,13 +149,13 @@ static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucke return 0; } -int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceEntry **deviceEntries) +int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, KfdDeviceEntry **deviceEntries) { uint32_t devinfo_index = 0; struct tp_node *node; list_for_each_entry(node, &sys->nodes, listm_system) { - DeviceEntry *devinfo = deviceEntries[devinfo_index++]; + KfdDeviceEntry *devinfo = deviceEntries[devinfo_index++]; devinfo->node_id = node->id; @@ -380,11 +223,11 @@ int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceE return 0; } -int devinfo_to_topology(DeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) +int devinfo_to_topology(KfdDeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) { for (int i = 0; i < num_devices; i++) { struct tp_node *node; - DeviceEntry *devinfo = devinfos[i]; + KfdDeviceEntry *devinfo = devinfos[i]; node = sys_add_node(sys, devinfo->node_id, devinfo->gpu_id); if (!node) @@ -449,9 +292,58 @@ void getenv_bool(const char *var, bool *value) pr_info("param: %s:%s\n", var, *value ? 
"Y" : "N"); } +void getenv_size_t(const char *var, size_t *value) +{ + char *value_str = getenv(var); + char *endp = value_str; + int sh = 0; + size_t size; + + pr_info("Value str: %s\n", value_str); + + if (value_str) { + size = (size_t)strtoul(value_str, &endp, 0); + if (errno || value_str == endp) { + pr_err("Ignoring invalid value for %s=%s, expecting a positive integer\n", var, value_str); + return; + } + switch (*endp) { + case 'k': + case 'K': + sh = 10; + break; + case 'M': + sh = 20; + break; + case 'G': + sh = 30; + break; + case '\0': + sh = 0; + break; + default: + pr_err("Ignoring invalid size suffix for %s=%s, expecting 'K'/k', 'M', or 'G'\n", var, value_str); + return; + } + if (SIZE_MAX >> sh < size) { + pr_err("Ignoring invalid value for %s=%s, exceeds SIZE_MAX\n", var, value_str); + return; + } + *value = size << sh; + } + pr_info("param: %s:0x%lx\n", var, *value); +} + int amdgpu_plugin_init(int stage) { - pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + + pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); topology_init(&src_topology); topology_init(&dest_topology); @@ -476,12 +368,18 @@ int amdgpu_plugin_init(int stage) getenv_bool("KFD_NUMA_CHECK", &kfd_numa_check); getenv_bool("KFD_CAPABILITY_CHECK", &kfd_capability_check); } + kfd_max_buffer_size = 0; + getenv_size_t("KFD_MAX_BUFFER_SIZE", &kfd_max_buffer_size); + return 0; } void amdgpu_plugin_fini(int stage, int ret) { - pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + if (plugin_disabled) + return; + + pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); if (stage == CR_PLUGIN_STAGE__RESTORE) sys_close_drm_render_devices(&dest_topology); @@ -501,7 +399,7 @@ struct thread_data { uint32_t gpu_id; pid_t pid; struct kfd_criu_bo_bucket 
*bo_buckets; - BoEntry **bo_entries; + KfdBoEntry **bo_entries; int drm_fd; int ret; int id; /* File ID used by CRIU to identify KFD image for this process */ @@ -509,38 +407,36 @@ struct thread_data { int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) { - struct stat st_kfd, st_dri_min; - char img_path[128]; + struct stat st_kfd; int ret = 0; - pr_debug("amdgpu_plugin: Enter %s\n", __func__); + pr_debug("Enter %s\n", __func__); ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { pr_perror("stat error for /dev/kfd"); return ret; } - snprintf(img_path, sizeof(img_path), "/dev/dri/renderD%d", DRM_FIRST_RENDER_NODE); - - ret = stat(img_path, &st_dri_min); - if (ret == -1) { - pr_perror("stat error for %s", img_path); - return ret; - } - - if (major(st_buf->st_rdev) == major(st_kfd.st_rdev) || ((major(st_buf->st_rdev) == major(st_dri_min.st_rdev)) && - (minor(st_buf->st_rdev) >= minor(st_dri_min.st_rdev) && - minor(st_buf->st_rdev) >= DRM_FIRST_RENDER_NODE))) { + /* If input device is KFD return device as supported */ + if (major(st_buf->st_rdev) == major(st_kfd.st_rdev)) { pr_debug("Known non-regular mapping, kfd-renderD%d -> OK\n", minor(st_buf->st_rdev)); - pr_debug("AMD KFD(maj) = %d, DRI(maj,min) = %d:%d VMA Device fd(maj,min) = %d:%d\n", - major(st_kfd.st_rdev), major(st_dri_min.st_rdev), minor(st_dri_min.st_rdev), - major(st_buf->st_rdev), minor(st_buf->st_rdev)); - /* VMA belongs to kfd */ return 0; } - pr_perror("amdgpu_plugin: Can't handle the VMA mapping"); - return -ENOTSUP; + /* Determine if input is a DRM device and therefore is supported */ + ret = amdgpu_plugin_drm_handle_device_vma(fd, st_buf); + if (ret) + pr_perror("%s(), Can't handle VMAs of input device", __func__); + + if (!ret && !plugin_added_to_inventory) { + ret = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (ret) + pr_err("Failed to add AMDGPU plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + + return ret; } 
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -607,16 +503,15 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type) +static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type) { - uint64_t size, gpu_addr_src, gpu_addr_dest, gpu_addr_ib; - uint64_t gpu_addr_src_orig, gpu_addr_dest_orig; - amdgpu_va_handle h_va_src, h_va_dest, h_va_ib; - amdgpu_bo_handle h_bo_src, h_bo_dest, h_bo_ib; + uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; + uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; + amdgpu_va_handle h_va_src, h_va_dst, h_va_ib; + amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib; struct amdgpu_bo_import_result res = { 0 }; - uint64_t copy_size, bytes_remain, j = 0; - uint64_t n_packets; struct amdgpu_cs_ib_info ib_info; amdgpu_bo_list_handle h_bo_list; struct amdgpu_cs_request cs_req; @@ -625,102 +520,100 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am uint32_t expired; amdgpu_context_handle h_ctx; uint32_t *ib = NULL; - int err, shared_fd; + int j, err, shared_fd, packets_per_buffer; - shared_fd = bo_buckets[i].dmabuf_fd; - size = bo_buckets[i].size; + shared_fd = bo_bucket.dmabuf_fd; + size = bo_bucket.size; + buffer_bo_size = min(size, buffer_size); + packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1; + src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size; + dst_bo_size = (type == SDMA_OP_VRAM_READ) ? 
buffer_bo_size : size; plugin_log_msg("Enter %s\n", __func__); /* prepare src buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_src); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, src_bo_size, &h_bo_src); if (err) { pr_perror("failed to create userptr for sdma"); return -EFAULT; } - break; - case SDMA_OP_VRAM_READ: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); return -EFAULT; } - h_bo_src = res.buf_handle; break; - default: pr_perror("Invalid sdma operation"); return -EINVAL; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_src, &h_va_src, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, src_bo_size, 0x1000, 0, &gpu_addr_src, + &h_va_src, 0); if (err) { pr_perror("failed to alloc VA for src bo"); goto err_src_va; } - err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_src, 0, src_bo_size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the src BO"); goto err_src_bo_map; } - plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, size); + plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, src_bo_size); + /* prepare dest buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - - h_bo_dest = res.buf_handle; + h_bo_dst = res.buf_handle; break; - case SDMA_OP_VRAM_READ: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_dest); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, dst_bo_size, &h_bo_dst); if (err) { pr_perror("failed to create userptr for sdma"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; 
} break; - default: pr_perror("Invalid sdma operation"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_dest, &h_va_dest, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, dst_bo_size, 0x1000, 0, &gpu_addr_dst, + &h_va_dst, 0); if (err) { pr_perror("failed to alloc VA for dest bo"); - goto err_dest_va; + goto err_dst_va; } - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, dst_bo_size, gpu_addr_dst, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the dest BO"); - goto err_dest_bo_map; + goto err_dst_bo_map; } - plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dest, size); + plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dst, dst_bo_size); - n_packets = (size + max_copy_size) / max_copy_size; /* prepare ring buffer/indirect buffer for command submission * each copy packet is 7 dwords so we need to alloc 28x size for ib */ - err = alloc_and_map(h_dev, n_packets * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, + err = alloc_and_map(h_dev, packets_per_buffer * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, (void **)&ib); if (err) { pr_perror("failed to allocate and map ib/rb"); goto err_ib_gpu_alloc; } - - plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, n_packets * 28); + plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, packets_per_buffer * 28); resources[0] = h_bo_src; - resources[1] = h_bo_dest; + resources[1] = h_bo_dst; resources[2] = h_bo_ib; err = amdgpu_bo_list_create(h_dev, 3, resources, NULL, &h_bo_list); if (err) { @@ -728,103 +621,123 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am goto err_bo_list; } - memset(&cs_req, 0, sizeof(cs_req)); - memset(&fence, 0, sizeof(fence)); - memset(ib, 0, n_packets * 28); - - plugin_log_msg("setting 
up sdma packets for command submission\n"); bytes_remain = size; - gpu_addr_src_orig = gpu_addr_src; - gpu_addr_dest_orig = gpu_addr_dest; + if (type == SDMA_OP_VRAM_WRITE) + copy_dst = gpu_addr_dst; + else + copy_src = gpu_addr_src; + while (bytes_remain > 0) { - copy_size = min(bytes_remain, max_copy_size); - - ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); - ib[j++] = copy_size; - ib[j++] = 0; - ib[j++] = 0xffffffff & gpu_addr_src; - ib[j++] = (0xffffffff00000000 & gpu_addr_src) >> 32; - ib[j++] = 0xffffffff & gpu_addr_dest; - ib[j++] = (0xffffffff00000000 & gpu_addr_dest) >> 32; - - gpu_addr_src += copy_size; - gpu_addr_dest += copy_size; - bytes_remain -= copy_size; - } - - gpu_addr_src = gpu_addr_src_orig; - gpu_addr_dest = gpu_addr_dest_orig; - plugin_log_msg("pad the IB to align on 8 dw boundary\n"); - /* pad the IB to the required number of dw with SDMA_NOP */ - while (j & 7) - ib[j++] = SDMA_NOP; - - ib_info.ib_mc_address = gpu_addr_ib; - ib_info.size = j; - - cs_req.ip_type = AMDGPU_HW_IP_DMA; - /* possible future optimization: may use other rings, info available in - * amdgpu_query_hw_ip_info() - */ - cs_req.ring = 0; - cs_req.number_of_ibs = 1; - cs_req.ibs = &ib_info; - cs_req.resources = h_bo_list; - cs_req.fence_info.handle = NULL; - - plugin_log_msg("create the context\n"); - err = amdgpu_cs_ctx_create(h_dev, &h_ctx); - if (err) { - pr_perror("failed to create context for SDMA command submission"); - goto err_ctx; - } + memset(&cs_req, 0, sizeof(cs_req)); + memset(&fence, 0, sizeof(fence)); + memset(&ib_info, 0, sizeof(ib_info)); + memset(ib, 0, packets_per_buffer * 28); + + if (type == SDMA_OP_VRAM_WRITE) { + err = read_fp(storage_fp, buffer, min(bytes_remain, buffer_bo_size)); + if (err) { + pr_perror("failed to read from storage"); + goto err_bo_list; + } + } - plugin_log_msg("initiate sdma command submission\n"); - err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); - if (err) { - pr_perror("failed to submit command for SDMA 
IB"); - goto err_cs_submit_ib; - } + buffer_space_remain = buffer_bo_size; + if (type == SDMA_OP_VRAM_WRITE) + copy_src = gpu_addr_src; + else + copy_dst = gpu_addr_dst; + j = 0; + + while (bytes_remain > 0 && buffer_space_remain > 0) { + copy_size = min(min(bytes_remain, max_copy_size), buffer_space_remain); + + ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); + ib[j++] = copy_size; + ib[j++] = 0; + ib[j++] = 0xffffffff & copy_src; + ib[j++] = (0xffffffff00000000 & copy_src) >> 32; + ib[j++] = 0xffffffff & copy_dst; + ib[j++] = (0xffffffff00000000 & copy_dst) >> 32; + + copy_src += copy_size; + copy_dst += copy_size; + bytes_remain -= copy_size; + buffer_space_remain -= copy_size; + } + /* pad the IB to the required number of dw with SDMA_NOP */ + while (j & 7) + ib[j++] = SDMA_NOP; - fence.context = h_ctx; - fence.ip_type = AMDGPU_HW_IP_DMA; - fence.ip_instance = 0; - fence.ring = 0; - fence.fence = cs_req.seq_no; - err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); - if (err) { - pr_perror("failed to query fence status"); - goto err_cs_submit_ib; - } + ib_info.ib_mc_address = gpu_addr_ib; + ib_info.size = j; - if (!expired) { - pr_err("IB execution did not complete\n"); - err = -EBUSY; - goto err_cs_submit_ib; - } + cs_req.ip_type = AMDGPU_HW_IP_DMA; + /* possible future optimization: may use other rings, info available in + * amdgpu_query_hw_ip_info() + */ + cs_req.ring = 0; + cs_req.number_of_ibs = 1; + cs_req.ibs = &ib_info; + cs_req.resources = h_bo_list; + cs_req.fence_info.handle = NULL; + + err = amdgpu_cs_ctx_create(h_dev, &h_ctx); + if (err) { + pr_perror("failed to create context for SDMA command submission"); + goto err_ctx; + } + err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); + if (err) { + pr_perror("failed to submit command for SDMA IB"); + goto err_cs_submit_ib; + } + + fence.context = h_ctx; + fence.ip_type = AMDGPU_HW_IP_DMA; + fence.ip_instance = 0; + fence.ring = 0; + fence.fence = 
cs_req.seq_no; + err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); + if (err) { + pr_perror("failed to query fence status"); + goto err_cs_submit_ib; + } + if (!expired) { + pr_err("IB execution did not complete\n"); + err = -EBUSY; + goto err_cs_submit_ib; + } - plugin_log_msg("done querying fence status\n"); + if (type == SDMA_OP_VRAM_READ) { + err = write_fp(storage_fp, buffer, buffer_bo_size - buffer_space_remain); + if (err) { + pr_perror("failed to write out to storage"); + goto err_cs_submit_ib; + } + } err_cs_submit_ib: - amdgpu_cs_ctx_free(h_ctx); + amdgpu_cs_ctx_free(h_ctx); + if (err) + break; + } err_ctx: amdgpu_bo_list_destroy(h_bo_list); err_bo_list: - free_and_unmap(n_packets * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); + free_and_unmap(packets_per_buffer * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); err_ib_gpu_alloc: - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_UNMAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, size, gpu_addr_dst, 0, AMDGPU_VA_OP_UNMAP); if (err) - pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dest, size); -err_dest_bo_map: - err = amdgpu_va_range_free(h_va_dest); + pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dst, size); +err_dst_bo_map: + err = amdgpu_va_range_free(h_va_dst); if (err) pr_perror("dest range free failed"); -err_dest_va: - err = amdgpu_bo_free(h_bo_dest); +err_dst_va: + err = amdgpu_bo_free(h_bo_dst); if (err) pr_perror("dest bo free failed"); - -err_dest_bo_prep: +err_dst_bo_prep: err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_UNMAP); if (err) pr_perror("failed to GPU unmap the src BO %lx, size = %lx", gpu_addr_src, size); @@ -836,7 +749,6 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am err = amdgpu_bo_free(h_bo_src); if (err) pr_perror("src bo free failed"); - plugin_log_msg("Leaving sdma_copy_bo, err = %d\n", err); return err; } @@ -845,19 +757,18 @@ void 
*dump_bo_contents(void *_thread_data) { struct thread_data *thread_data = (struct thread_data *)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - BoEntry **bo_info = thread_data->bo_entries; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; - size_t max_bo_size = 0, image_size = 0; + size_t max_bo_size = 0, image_size = 0, buffer_size; uint64_t max_copy_size; uint32_t major, minor; int num_bos = 0; int i, ret = 0; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); if (ret) { @@ -884,15 +795,16 @@ void *dump_bo_contents(void *_thread_data) } } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. 
Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } - snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, true, &image_size); if (!bo_contents_fp) { pr_perror("Cannot fopen %s", img_path); @@ -910,19 +822,16 @@ void *dump_bo_contents(void *_thread_data) num_bos++; /* perform sDMA based vram copy */ - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_READ); + ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_READ); if (ret) { pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i); break; } - plugin_log_msg("** Successfully drained the BO using sDMA: bo_buckets[%d] **\n", i); - ret = write_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - break; } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -939,19 +848,18 @@ void *restore_bo_contents(void *_thread_data) { struct thread_data *thread_data = (struct thread_data *)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - size_t image_size = 0, total_bo_size = 0, max_bo_size = 0; - BoEntry **bo_info = thread_data->bo_entries; + size_t image_size = 0, total_bo_size = 0, max_bo_size = 0, buffer_size; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; uint64_t max_copy_size; uint32_t major, minor; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; int num_bos = 0; int i, ret = 0; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, 
&major, &minor, &h_dev); if (ret) { @@ -969,7 +877,7 @@ void *restore_bo_contents(void *_thread_data) max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : SDMA_LINEAR_COPY_MAX_SIZE - 1; - snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, false, &image_size); if (!bo_contents_fp) { pr_perror("Cannot fopen %s", img_path); @@ -977,7 +885,6 @@ void *restore_bo_contents(void *_thread_data) goto exit; } - /* Allocate buffer to fit biggest BO */ for (i = 0; i < thread_data->num_of_bos; i++) { if (bo_buckets[i].gpu_id == thread_data->gpu_id && (bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))) { @@ -989,17 +896,17 @@ void *restore_bo_contents(void *_thread_data) } if (total_bo_size != image_size) { - pr_err("amdgpu_plugin: %s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, - total_bo_size); + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, total_bo_size); ret = -EINVAL; goto exit; } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. 
Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } @@ -1013,11 +920,8 @@ void *restore_bo_contents(void *_thread_data) num_bos++; - ret = read_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - goto exit; - - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE); if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); break; @@ -1026,7 +930,7 @@ void *restore_bo_contents(void *_thread_data) } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -1054,9 +958,9 @@ int check_hsakmt_shared_mem(uint64_t *shared_mem_size, uint32_t *shared_mem_magi /* First 4 bytes of shared file is the magic */ ret = read_file(HSAKMT_SHM_PATH, shared_mem_magic, sizeof(*shared_mem_magic)); if (ret) - pr_perror("amdgpu_plugin: Failed to read shared mem magic"); + pr_perror("Failed to read shared mem magic"); else - plugin_log_msg("amdgpu_plugin: Shared mem magic:0x%x\n", *shared_mem_magic); + plugin_log_msg("Shared mem magic:0x%x\n", *shared_mem_magic); return 0; } @@ -1071,7 +975,7 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha return 0; if (!stat(HSAKMT_SHM_PATH, &st)) { - pr_debug("amdgpu_plugin: %s already exists\n", HSAKMT_SHM_PATH); + pr_debug("%s already exists\n", HSAKMT_SHM_PATH); } else { pr_info("Warning:%s was missing. 
Re-creating new file but we may lose perf counters\n", HSAKMT_SHM_PATH); @@ -1079,14 +983,14 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha ret = ftruncate(fd, shared_mem_size); if (ret < 0) { - pr_err("amdgpu_plugin: Failed to truncate shared mem %s\n", HSAKMT_SHM); + pr_err("Failed to truncate shared mem %s\n", HSAKMT_SHM); close(fd); return -errno; } ret = write(fd, &shared_mem_magic, sizeof(shared_mem_magic)); if (ret != sizeof(shared_mem_magic)) { - pr_perror("amdgpu_plugin: Failed to restore shared mem magic"); + pr_perror("Failed to restore shared mem magic"); close(fd); return -errno; } @@ -1112,10 +1016,14 @@ static int unpause_process(int fd) ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); if (ret) { - pr_perror("amdgpu_plugin: Failed to unpause process"); + pr_perror("Failed to unpause process"); goto exit; } + // Reset the KFD FD + kfd_checkpoint_fd = -1; + sys_close_drm_render_devices(&src_topology); + exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? 
"Failed" : "Ok", ret); @@ -1180,7 +1088,7 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd for (i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; - BoEntry *boinfo = e->bo_entries[i]; + KfdBoEntry *boinfo = e->bo_entries[i]; boinfo->gpu_id = bo_bucket->gpu_id; boinfo->addr = bo_bucket->addr; @@ -1254,7 +1162,7 @@ bool kernel_supports_criu(int fd) } if (kmtIoctl(fd, AMDKFD_IOC_GET_VERSION, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call get version ioctl"); + pr_perror("Failed to call get version ioctl"); ret = false; goto exit; } @@ -1262,8 +1170,8 @@ bool kernel_supports_criu(int fd) pr_debug("Kernel IOCTL version:%d.%02d\n", args.major_version, args.minor_version); if (args.major_version != KFD_IOCTL_MAJOR_VERSION || args.minor_version < MIN_KFD_IOCTL_MINOR_VERSION) { - pr_err("amdgpu_plugin: CR not supported on current kernel (current:%02d.%02d min:%02d.%02d)\n", - args.major_version, args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); + pr_err("CR not supported on current kernel (current:%02d.%02d min:%02d.%02d)\n", args.major_version, + args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); ret = false; goto exit; } @@ -1286,13 +1194,13 @@ int amdgpu_plugin_dump_file(int fd, int id) size_t len; if (fstat(fd, &st) == -1) { - pr_perror("amdgpu_plugin: fstat error"); + pr_perror("fstat error"); return -1; } ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { - pr_perror("amdgpu_plugin: fstat error for /dev/kfd"); + pr_perror("fstat error for /dev/kfd"); return -1; } @@ -1307,50 +1215,31 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } + /* Initialize number of device files that will be checkpointed */ + init_gpu_count(&src_topology); + /* Check whether this plugin was called for kfd or render nodes */ if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { + /* This is RenderD dumper plugin, for now 
just save renderD * minor number to be used during restore. In later phases this * needs to save more data for video decode etc. */ - - CriuRenderNode rd = CRIU_RENDER_NODE__INIT; - struct tp_node *tp_node; - - pr_info("amdgpu_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), - fd, id); - - tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev)); - if (!tp_node) { - pr_err("amdgpu_plugin: Failed to find a device with minor number = %d\n", minor(st.st_rdev)); - - return -ENODEV; - } - - rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); - if (!rd.gpu_id) - return -ENODEV; - - len = criu_render_node__get_packed_size(&rd); - buf = xmalloc(len); - if (!buf) - return -ENOMEM; - - criu_render_node__pack(&rd, buf); - - snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id); - ret = write_img_file(img_path, buf, len); - if (ret) { - xfree(buf); + ret = amdgpu_plugin_drm_dump_file(fd, id, &st); + if (ret) return ret; + + /* Invoke unpause process if needed */ + decrement_checkpoint_count(); + if (checkpoint_is_complete()) { + ret = unpause_process(kfd_checkpoint_fd); } - xfree(buf); /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; } - pr_info("amdgpu_plugin: %s : %s() called for fd = %d\n", CR_PLUGIN_DESC.name, __func__, major(st.st_rdev)); + pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev)); /* KFD only allows ioctl calls from the same process that opened the KFD file descriptor. 
* The existing /dev/kfd file descriptor that is passed in is only allowed to do IOCTL calls with @@ -1362,13 +1251,13 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_PROCESS_INFO; if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call process info ioctl"); + pr_perror("Failed to call process info ioctl"); ret = -1; goto exit; } - pr_info("amdgpu_plugin: devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, - args.num_objects, args.priv_data_size); + pr_info("devices:%" PRIu32 " bos:%" PRIu32 " objects:%" PRIu32 " priv_data:%" PRIu64 "\n", + args.num_devices, args.num_bos, args.num_objects, args.priv_data_size); e = xmalloc(sizeof(*e)); if (!e) { @@ -1401,7 +1290,7 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_CHECKPOINT; ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); if (ret) { - pr_perror("amdgpu_plugin: Failed to call dumper (process) ioctl"); + pr_perror("Failed to call dumper (process) ioctl"); goto exit; } @@ -1423,11 +1312,11 @@ int amdgpu_plugin_dump_file(int fd, int id) goto exit; snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); - pr_info("amdgpu_plugin: img_path = %s\n", img_path); + pr_info("img_path = %s\n", img_path); len = criu_kfd__get_packed_size(e); - pr_info("amdgpu_plugin: Len = %ld\n", len); + pr_info("Len = %ld\n", len); buf = xmalloc(len); if (!buf) { @@ -1441,11 +1330,15 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = write_img_file(img_path, buf, len); xfree(buf); + exit: - /* Restore all queues */ - unpause_process(fd); + /* Restore all queues if conditions permit */ + kfd_checkpoint_fd = fd; + decrement_checkpoint_count(); + if (checkpoint_is_complete()) { + ret = unpause_process(fd); + } - sys_close_drm_render_devices(&src_topology); xfree((void *)args.devices); xfree((void *)args.bos); xfree((void *)args.priv_data); @@ -1453,9 +1346,9 @@ int amdgpu_plugin_dump_file(int fd, int id) free_e(e); if (ret) - 
pr_err("amdgpu_plugin: Failed to dump (ret:%d)\n", ret); + pr_err("Failed to dump (ret:%d)\n", ret); else - pr_info("amdgpu_plugin: Dump successful\n"); + pr_info("Dump successful\n"); return ret; } @@ -1478,7 +1371,7 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) for (int entries_i = 0; entries_i < e->num_of_cpus + e->num_of_gpus; entries_i++) { struct kfd_criu_device_bucket *device_bucket; - DeviceEntry *devinfo = e->device_entries[entries_i]; + KfdDeviceEntry *devinfo = e->device_entries[entries_i]; struct tp_node *tp_node; if (!devinfo->gpu_id) @@ -1501,10 +1394,10 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) device_bucket->drm_fd = node_get_drm_render_device(tp_node); if (device_bucket->drm_fd < 0) { - pr_perror("amdgpu_plugin: Can't pass NULL drm render fd to driver"); + pr_perror("Can't pass NULL drm render fd to driver"); goto exit; } else { - pr_info("amdgpu_plugin: passing drm render fd = %d to driver\n", device_bucket->drm_fd); + pr_info("passing drm render fd = %d to driver\n", device_bucket->drm_fd); } } @@ -1528,7 +1421,7 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) for (int i = 0; i < args->num_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; - BoEntry *bo_entry = e->bo_entries[i]; + KfdBoEntry *bo_entry = e->bo_entries[i]; bo_bucket->gpu_id = bo_entry->gpu_id; bo_bucket->addr = bo_entry->addr; @@ -1588,7 +1481,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf vma_md->new_pgoff = bo_bucket->restored_offset; vma_md->fd = node_get_drm_render_device(tp_node); - plugin_log_msg("amdgpu_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx " + plugin_log_msg("adding vma_entry:addr:0x%lx old-off:0x%lx " "new_off:0x%lx new_minor:%d\n", vma_md->vma_entry, vma_md->old_pgoff, vma_md->new_pgoff, vma_md->new_minor); @@ -1669,7 +1562,10 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; - 
pr_info("amdgpu_plugin: Initialized kfd plugin restorer with ID = %d\n", id); + if (plugin_disabled) + return -ENOTSUP; + + pr_info("Initialized kfd plugin restorer with ID = %d\n", id); snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); @@ -1683,7 +1579,7 @@ int amdgpu_plugin_restore_file(int id) * TODO: Currently, this code will only work if this function is called for /dev/kfd * first as we assume restore_maps is already filled. Need to fix this later. */ - snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id); + snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); pr_info("Restoring RenderD %s\n", img_path); img_fp = open_img_file(img_path, false, &img_size); @@ -1713,7 +1609,7 @@ int amdgpu_plugin_restore_file(int id) } fclose(img_fp); - pr_info("amdgpu_plugin: render node gpu_id = 0x%04x\n", rd->gpu_id); + pr_info("render node gpu_id = 0x%04x\n", rd->gpu_id); target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id); if (!target_gpu_id) { @@ -1727,11 +1623,11 @@ int amdgpu_plugin_restore_file(int id) goto fail; } - pr_info("amdgpu_plugin: render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); + pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); fd = node_get_drm_render_device(tp_node); if (fd < 0) - pr_err("amdgpu_plugin: Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); + pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); fail: criu_render_node__free_unpacked(rd, NULL); xfree(buf); @@ -1743,7 +1639,12 @@ int amdgpu_plugin_restore_file(int id) * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in * tp_node. 
*/ - return dup(fd); + fd = dup(fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + return fd; } fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1752,7 +1653,7 @@ int amdgpu_plugin_restore_file(int id) return -1; } - pr_info("amdgpu_plugin: Opened kfd, fd = %d\n", fd); + pr_info("Opened kfd, fd = %d\n", fd); if (!kernel_supports_criu(fd)) return -ENOTSUP; @@ -1780,7 +1681,7 @@ int amdgpu_plugin_restore_file(int id) return -1; } - plugin_log_msg("amdgpu_plugin: read image file data\n"); + plugin_log_msg("read image file data\n"); /* * Initialize fd_next to be 1 greater than the biggest file descriptor in use by the target restore process. @@ -1847,10 +1748,10 @@ int amdgpu_plugin_restore_file(int id) xfree(buf); if (ret) { - pr_err("amdgpu_plugin: Failed to restore (ret:%d)\n", ret); + pr_err("Failed to restore (ret:%d)\n", ret); fd = ret; } else { - pr_info("amdgpu_plugin: Restore successful (fd:%d)\n", fd); + pr_info("Restore successful (fd:%d)\n", fd); } return fd; @@ -1870,7 +1771,10 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const char *p_end; bool is_kfd = false, is_renderD = false; - plugin_log_msg("amdgpu_plugin: Enter %s\n", __func__); + if (plugin_disabled) + return -ENOTSUP; + + plugin_log_msg("Enter %s\n", __func__); strncpy(path, in_path, sizeof(path)); @@ -1903,13 +1807,18 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) { *new_offset = vma_md->new_pgoff; - if (is_renderD) - *updated_fd = vma_md->fd; - else - *updated_fd = -1; + *updated_fd = -1; + if (is_renderD) { + int fd = dup(vma_md->fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + *updated_fd = fd; + } - plugin_log_msg("amdgpu_plugin: old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, - vma_md->new_pgoff, *updated_fd); + plugin_log_msg("old_pgoff=0x%lx new_pgoff=0x%lx 
fd=%d\n", vma_md->old_pgoff, vma_md->new_pgoff, + *updated_fd); return 1; } @@ -1922,26 +1831,34 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, amdgpu_plugin_update_vma int amdgpu_plugin_resume_devices_late(int target_pid) { struct kfd_ioctl_criu_args args = { 0 }; - int fd, ret = 0; + int fd, exit_code = 0; - pr_info("amdgpu_plugin: Inside %s for target pid = %d\n", __func__, target_pid); + if (plugin_disabled) + return -ENOTSUP; + + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); if (fd < 0) { pr_perror("failed to open kfd in plugin"); - return -1; + return -ENOTSUP; } args.pid = target_pid; args.op = KFD_CRIU_OP_RESUME; - pr_info("amdgpu_plugin: Calling IOCTL to start notifiers and queues\n"); + pr_info("Calling IOCTL to start notifiers and queues\n"); if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("restore late ioctl failed"); - ret = -1; + if (errno == ESRCH) { + pr_info("Pid %d has no kfd process info\n", target_pid); + exit_code = -ENOTSUP; + } else { + pr_perror("restore late ioctl failed"); + exit_code = -1; + } } close(fd); - return ret; + return exit_code; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c new file mode 100644 index 0000000000..d54cd937d5 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -0,0 +1,100 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "common/list.h" + +#include "criu-amdgpu.pb-c.h" + +#include +#include + +#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + + +int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat 
*st) +{ + char path[PATH_MAX]; + struct stat drm; + int ret = 0; + + snprintf(path, sizeof(path), AMDGPU_DRM_DEVICE, DRM_FIRST_RENDER_NODE); + ret = stat(path, &drm); + if (ret == -1) { + pr_err("Error in getting stat for: %s\n", path); + return ret; + } + + if ((major(st->st_rdev) != major(drm.st_rdev)) || + (minor(st->st_rdev) < minor(drm.st_rdev)) || + (minor(st->st_rdev) > DRM_LAST_RENDER_NODE)) { + pr_err("Can't handle VMA mapping of input device\n"); + return -ENOTSUP; + } + + pr_debug("AMD DRI(maj,min) = %d:%d VMA Device FD(maj,min) = %d:%d\n", + major(drm.st_rdev), minor(drm.st_rdev), + major(st->st_rdev), minor(st->st_rdev)); + + return 0; +} + + +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) +{ + CriuRenderNode rd = CRIU_RENDER_NODE__INIT; + struct tp_node *tp_node; + char path[PATH_MAX]; + unsigned char *buf; + int minor; + int len; + int ret; + + /* Get the topology node of the DRM device */ + minor = minor(drm->st_rdev); + tp_node = sys_get_node_by_render_minor(&src_topology, minor); + if (!tp_node) { + pr_err("Failed to find a device with minor number = %d\n", minor); + return -ENODEV; + } + + /* Get the GPU_ID of the DRM device */ + rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); + if (!rd.gpu_id) { + pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id); + return -ENODEV; + } + + len = criu_render_node__get_packed_size(&rd); + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + + criu_render_node__pack(&rd, buf); + + snprintf(path, sizeof(path), IMG_DRM_FILE, id); + ret = write_img_file(path, buf, len); + xfree(buf); + return ret; +} diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h new file mode 100644 index 0000000000..6f0c1a9a63 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -0,0 +1,28 @@ +#ifndef __AMDGPU_PLUGIN_DRM_H__ +#define __AMDGPU_PLUGIN_DRM_H__ + +#include +#include "common/list.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include 
"kfd_ioctl.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + + +/** + * Determines if VMA's of input file descriptor belong to amdgpu's + * DRM device and are therefore supported + */ +int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); + +/** + * Serialize meta-data about a particular DRM device, its number of BOs, + * etc into a file. The serialized filename has in it the value ID that + * is passed in as a parameter + */ +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm); + +#endif /* __AMDGPU_PLUGIN_DRM_H__ */ + diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 42689933ee..5b4396a0cc 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -16,34 +16,11 @@ #include "xmalloc.h" #include "kfd_ioctl.h" +#include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" #define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/" - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif - -#ifdef COMPILE_TESTS -#undef pr_err -#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg) -#undef pr_info -#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg) -#undef pr_debug -#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg) - -#undef pr_perror -#define pr_perror(format, arg...) \ - fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) -#endif - -#ifdef DEBUG -#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) -#else -#define plugin_log_msg(fmt, ...) 
\ - { \ - } -#endif +#define MAX_PARAMETER_LEN 64 /* User override options */ /* Skip firmware version check */ @@ -441,7 +418,9 @@ struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id static bool get_prop(char *line, char *name, uint64_t *value) { - if (sscanf(line, " %29s %lu", name, value) != 2) + char format[16]; + sprintf(format, " %%%ds %%lu", MAX_PARAMETER_LEN); + if (sscanf(line, format, name, value) != 2) return false; return true; } @@ -461,7 +440,7 @@ static int parse_topo_node_properties(struct tp_node *dev, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -589,7 +568,7 @@ static int parse_topo_node_mem_banks(struct tp_node *node, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -678,7 +657,7 @@ static int parse_topo_node_iolinks(struct tp_node *node, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -840,6 +819,9 @@ void topology_free(struct tp_system *sys) list_del(&p2pgroup->listm_system); xfree(p2pgroup); } + + /* Update Topology as being freed */ + sys->parsed = false; } /** @@ -1063,7 +1045,7 @@ static bool iolink_match(struct tp_iolink *src, struct tp_iolink *dest) * * Nodes compatibility are determined by: * 1. Comparing the node properties - * 2. Making sure iolink mappings to CPUs would be compabitle with existing iolink mappings in maps + * 2. Making sure iolink mappings to CPUs would be compatible with existing iolink mappings in maps * * If src_node and dest_node are mappable, then map_device will push the new mapping * for src_node -> dest_node into new_maps. 
@@ -1241,7 +1223,7 @@ static bool map_devices(struct tp_system *src_sys, struct tp_system *dest_sys, s return true; } else { /* We could not map remaining nodes in the list. Add dest node back - * to list and try to map next dest ndoe in list to current src + * to list and try to map next dest node in list to current src * node. */ pr_debug("Nodes after [0x%04X -> 0x%04X] did not match, " @@ -1461,3 +1443,15 @@ int set_restore_gpu_maps(struct tp_system *src_sys, struct tp_system *dest_sys, return ret; } + +int topology_gpu_count(struct tp_system *sys) +{ + struct tp_node *node; + int count = 0; + + list_for_each_entry(node, &sys->nodes, listm_system) + if (NODE_IS_GPU(node)) + count++; + return count; +} + diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index 9d99cda1c2..c890e3ddae 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -107,6 +107,8 @@ int topology_parse(struct tp_system *topology, const char *msg); int topology_determine_iolinks(struct tp_system *sys); void topology_print(const struct tp_system *sys, const char *msg); +int topology_gpu_count(struct tp_system *topology); + struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id); struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id); diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c new file mode 100644 index 0000000000..a165fc9cd5 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -0,0 +1,206 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "common/list.h" + +#include +#include + +#include "criu-plugin.h" +#include "plugin.h" +#include "criu-amdgpu.pb-c.h" + +#include "img-streamer.h" +#include "image.h" +#include "cr_options.h" + 
+#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + +/* Tracks number of device files that need to be checkpointed */ +static int dev_file_cnt = 0; + +/* Helper structures to encode device topology of SRC and DEST platforms */ +struct tp_system src_topology; +struct tp_system dest_topology; + +/* Helper structures to encode device maps during Checkpoint and Restore operations */ +struct device_maps checkpoint_maps; +struct device_maps restore_maps; + +bool checkpoint_is_complete() +{ + return (dev_file_cnt == 0); +} + +void decrement_checkpoint_count() +{ + dev_file_cnt--; +} + +void init_gpu_count(struct tp_system *topo) +{ + if (dev_file_cnt != 0) + return; + + /* We add ONE to include checkpointing of KFD device */ + dev_file_cnt = 1 + topology_gpu_count(topo); +} + +int read_fp(FILE *fp, void *buf, const size_t buf_len) +{ + size_t len_read; + + len_read = fread(buf, 1, buf_len, fp); + if (len_read != buf_len) { + pr_err("Unable to read file (read:%ld buf_len:%ld)\n", len_read, buf_len); + return -EIO; + } + return 0; +} + +int write_fp(FILE *fp, const void *buf, const size_t buf_len) +{ + size_t len_write; + + len_write = fwrite(buf, 1, buf_len, fp); + if (len_write != buf_len) { + pr_err("Unable to write file (wrote:%ld buf_len:%ld)\n", len_write, buf_len); + return -EIO; + } + return 0; +} + +/** + * @brief Open an image file + * + * We store the size of the actual contents in the first 8-bytes of + * the file. This allows us to determine the file size when using + * criu_image_streamer when fseek and fstat are not available. The + * FILE * returned is already at the location of the first actual + * contents. 
+ * + * @param path The file path + * @param write False for read, true for write + * @param size Size of actual contents + * @return FILE *if successful, NULL if failed + */ +FILE *open_img_file(char *path, bool write, size_t *size) +{ + FILE *fp = NULL; + int fd, ret; + + if (opts.stream) + fd = img_streamer_open(path, write ? O_DUMP : O_RSTR); + else + fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); + + if (fd < 0) { + pr_err("%s: Failed to open for %s\n", path, write ? "write" : "read"); + return NULL; + } + + fp = fdopen(fd, write ? "w" : "r"); + if (!fp) { + pr_err("%s: Failed get pointer for %s\n", path, write ? "write" : "read"); + return NULL; + } + + if (write) + ret = write_fp(fp, size, sizeof(*size)); + else + ret = read_fp(fp, size, sizeof(*size)); + + if (ret) { + pr_err("%s:Failed to access file size\n", path); + fclose(fp); + return NULL; + } + + pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size); + return fp; +} + +int read_file(const char *file_path, void *buf, const size_t buf_len) +{ + int ret; + FILE *fp; + + fp = fopen(file_path, "r"); + if (!fp) { + pr_err("Cannot fopen %s\n", file_path); + return -errno; + } + + ret = read_fp(fp, buf, buf_len); + fclose(fp); /* this will also close fd */ + return ret; +} + + +/** + * @brief Write an image file + * + * We store the size of the actual contents in the first 8-bytes of the file. This allows us to + * determine the file size when using criu_image_streamer when fseek and fstat are not available. + * + * @param path The file path + * @param buf pointer to data to be written + * @param buf_len size of buf + * @return 0 if successful. 
-errno on failure + */ +int write_img_file(char *path, const void *buf, const size_t buf_len) +{ + int ret; + FILE *fp; + size_t len = buf_len; + + fp = open_img_file(path, true, &len); + if (!fp) + return -errno; + + ret = write_fp(fp, buf, buf_len); + fclose(fp); /* this will also close fd */ + return ret; +} + +void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list) +{ + struct kfd_criu_bo_bucket *bo; + + pr_info("\n"); + for (int idx = 0; idx < bo_cnt; idx++) { + bo = &bo_list[idx]; + pr_info("\n"); + pr_info("%s(), %d. KFD BO Addr: %" PRIx64 " \n", __func__, idx, bo->addr); + pr_info("%s(), %d. KFD BO Size: %" PRIx64 " \n", __func__, idx, bo->size); + pr_info("%s(), %d. KFD BO Offset: %" PRIx64 " \n", __func__, idx, bo->offset); + pr_info("%s(), %d. KFD BO Restored Offset: %" PRIx64 " \n", __func__, idx, bo->restored_offset); + pr_info("%s(), %d. KFD BO Alloc Flags: %x \n", __func__, idx, bo->alloc_flags); + pr_info("%s(), %d. KFD BO Gpu ID: %x \n", __func__, idx, bo->gpu_id); + pr_info("%s(), %d. KFD BO Dmabuf FD: %x \n", __func__, idx, bo->dmabuf_fd); + pr_info("\n"); + } + pr_info("\n"); +} diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h new file mode 100644 index 0000000000..aacca3a28c --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -0,0 +1,106 @@ +#ifndef __AMDGPU_PLUGIN_UTIL_H__ +#define __AMDGPU_PLUGIN_UTIL_H__ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif + +#ifdef COMPILE_TESTS +#undef pr_err +#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg) +#undef pr_info +#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg) +#undef pr_debug +#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg) + +#undef pr_perror +#define pr_perror(format, arg...) 
\ + fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) +#endif + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "amdgpu_plugin: " + +#ifdef DEBUG +#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) +#else +#define plugin_log_msg(fmt, ...) \ + { \ + } +#endif + + +/* Path where KFD device is surfaced */ +#define AMDGPU_KFD_DEVICE "/dev/kfd" + +/* Path where DRM devices are surfaced */ +#define AMDGPU_DRM_DEVICE "/dev/dri/renderD%d" + +/* Minimum version of KFD IOCTL's that supports C&R */ +#define KFD_IOCTL_MAJOR_VERSION 1 +#define MIN_KFD_IOCTL_MINOR_VERSION 8 + +/* Name of file having serialized data of KFD device */ +#define IMG_KFD_FILE "amdgpu-kfd-%d.img" + +/* Name of file having serialized data of KFD buffer objects (BOs) */ +#define IMG_KFD_PAGES_FILE "amdgpu-pages-%d-%04x.img" + +/* Name of file having serialized data of DRM device */ +#define IMG_DRM_FILE "amdgpu-renderD-%d.img" + +/* Name of file having serialized data of DRM device buffer objects (BOs) */ +#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img" + +/* Helper macros to Checkpoint and Restore a ROCm file */ +#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" +#define HSAKMT_SHM "/hsakmt_shared_mem" +#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" +#define HSAKMT_SEM "hsakmt_semaphore" + +/* Help macros to build sDMA command packets */ +#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) + +#define SDMA_OPCODE_COPY 1 +#define SDMA_COPY_SUB_OPCODE_LINEAR 0 +#define SDMA_NOP 0 +#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21) + +enum sdma_op_type { + SDMA_OP_VRAM_READ, + SDMA_OP_VRAM_WRITE, +}; + +/* Helper structures to encode device topology of SRC and DEST platforms */ +extern struct tp_system src_topology; +extern struct tp_system dest_topology; + +/* Helper structures to encode device maps during Checkpoint and Restore operations */ +extern 
struct device_maps checkpoint_maps; +extern struct device_maps restore_maps; + +extern int fd_next; + +extern bool kfd_fw_version_check; +extern bool kfd_sdma_fw_version_check; +extern bool kfd_caches_count_check; +extern bool kfd_num_gws_check; +extern bool kfd_vram_size_check; +extern bool kfd_numa_check; +extern bool kfd_capability_check; + +int read_fp(FILE *fp, void *buf, const size_t buf_len); +int write_fp(FILE *fp, const void *buf, const size_t buf_len); +int read_file(const char *file_path, void *buf, const size_t buf_len); +int write_img_file(char *path, const void *buf, const size_t buf_len); +FILE *open_img_file(char *path, bool write, size_t *size); + +bool checkpoint_is_complete(); +void decrement_checkpoint_count(); +void init_gpu_count(struct tp_system *topology); + +void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list); + +#endif /* __AMDGPU_PLUGIN_UTIL_H__ */ diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 81d00d3ff1..078b676500 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -5,7 +5,7 @@ message dev_iolink { required uint32 node_to_id = 2; } -message device_entry { +message kfd_device_entry { required uint32 node_id = 1; required uint32 gpu_id = 2; required uint32 cpu_cores_count = 3; @@ -40,10 +40,10 @@ message device_entry { repeated dev_iolink iolinks = 32; } -message bo_entry { - required uint64 addr = 1; - required uint64 size = 2; - required uint64 offset = 3; +message kfd_bo_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; required uint32 alloc_flags = 4; required uint32 gpu_id = 5; } @@ -52,10 +52,10 @@ message criu_kfd { required uint32 pid = 1; required uint32 num_of_gpus = 2; required uint32 num_of_cpus = 3; - repeated device_entry device_entries = 4; - required uint64 num_of_bos = 5; - repeated bo_entry bo_entries = 6; - required uint32 num_of_objects = 7; + repeated kfd_device_entry 
device_entries = 4; + required uint64 num_of_bos = 5; + repeated kfd_bo_entry bo_entries = 6; + required uint32 num_of_objects = 7; required uint64 shared_mem_size = 8; required uint32 shared_mem_magic = 9; required bytes priv_data = 10; diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index b88fe20cfe..1a3bcea955 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -23,7 +23,7 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include +#include #include /* @@ -39,8 +39,8 @@ #define KFD_IOCTL_MINOR_VERSION 8 struct kfd_ioctl_get_version_args { - __u32 major_version; /* from KFD */ - __u32 minor_version; /* from KFD */ + uint32_t major_version; /* from KFD */ + uint32_t minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type. */ @@ -53,51 +53,51 @@ struct kfd_ioctl_get_version_args { #define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - __u64 ring_base_address; /* to KFD */ - __u64 write_pointer_address; /* from KFD */ - __u64 read_pointer_address; /* from KFD */ - __u64 doorbell_offset; /* from KFD */ + uint64_t ring_base_address; /* to KFD */ + uint64_t write_pointer_address; /* from KFD */ + uint64_t read_pointer_address; /* from KFD */ + uint64_t doorbell_offset; /* from KFD */ - __u32 ring_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 queue_type; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ - __u32 queue_id; /* from KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t queue_type; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ + uint32_t queue_id; /* from KFD */ - __u64 eop_buffer_address; /* to KFD */ - __u64 eop_buffer_size; /* to KFD */ - __u64 ctx_save_restore_address; /* to KFD */ - __u32 ctx_save_restore_size; /* to KFD */ - __u32 ctl_stack_size; /* to KFD */ + uint64_t eop_buffer_address; /* to KFD */ + uint64_t eop_buffer_size; 
/* to KFD */ + uint64_t ctx_save_restore_address; /* to KFD */ + uint32_t ctx_save_restore_size; /* to KFD */ + uint32_t ctl_stack_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - __u32 queue_id; /* to KFD */ - __u32 pad; + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_update_queue_args { - __u64 ring_base_address; /* to KFD */ + uint64_t ring_base_address; /* to KFD */ - __u32 queue_id; /* to KFD */ - __u32 ring_size; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ }; struct kfd_ioctl_set_cu_mask_args { - __u32 queue_id; /* to KFD */ - __u32 num_cu_mask; /* to KFD */ - __u64 cu_mask_ptr; /* to KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t num_cu_mask; /* to KFD */ + uint64_t cu_mask_ptr; /* to KFD */ }; struct kfd_ioctl_get_queue_wave_state_args { - __u64 ctl_stack_address; /* to KFD */ - __u32 ctl_stack_used_size; /* from KFD */ - __u32 save_area_used_size; /* from KFD */ - __u32 queue_id; /* to KFD */ - __u32 pad; + uint64_t ctl_stack_address; /* to KFD */ + uint32_t ctl_stack_used_size; /* from KFD */ + uint32_t save_area_used_size; /* from KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -105,13 +105,13 @@ struct kfd_ioctl_get_queue_wave_state_args { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - __u64 alternate_aperture_base; /* to KFD */ - __u64 alternate_aperture_size; /* to KFD */ + uint64_t alternate_aperture_base; /* to KFD */ + uint64_t alternate_aperture_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 default_policy; /* to KFD */ - __u32 alternate_policy; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t default_policy; /* to KFD */ + uint32_t alternate_policy; /* to 
KFD */ + uint32_t pad; }; /* @@ -122,24 +122,24 @@ struct kfd_ioctl_set_memory_policy_args { */ struct kfd_ioctl_get_clock_counters_args { - __u64 gpu_clock_counter; /* from KFD */ - __u64 cpu_clock_counter; /* from KFD */ - __u64 system_clock_counter; /* from KFD */ - __u64 system_clock_freq; /* from KFD */ + uint64_t gpu_clock_counter; /* from KFD */ + uint64_t cpu_clock_counter; /* from KFD */ + uint64_t system_clock_counter; /* from KFD */ + uint64_t system_clock_freq; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_process_device_apertures { - __u64 lds_base; /* from KFD */ - __u64 lds_limit; /* from KFD */ - __u64 scratch_base; /* from KFD */ - __u64 scratch_limit; /* from KFD */ - __u64 gpuvm_base; /* from KFD */ - __u64 gpuvm_limit; /* from KFD */ - __u32 gpu_id; /* from KFD */ - __u32 pad; + uint64_t lds_base; /* from KFD */ + uint64_t lds_limit; /* from KFD */ + uint64_t scratch_base; /* from KFD */ + uint64_t scratch_limit; /* from KFD */ + uint64_t gpuvm_base; /* from KFD */ + uint64_t gpuvm_limit; /* from KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t pad; }; /* @@ -152,20 +152,20 @@ struct kfd_ioctl_get_process_apertures_args { struct kfd_process_device_apertures process_apertures[NUM_OF_SUPPORTED_GPUS]; /* from KFD */ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - __u32 num_of_nodes; - __u32 pad; + uint32_t num_of_nodes; + uint32_t pad; }; struct kfd_ioctl_get_process_apertures_new_args { /* User allocated. Pointer to struct kfd_process_device_apertures * filled in by Kernel */ - __u64 kfd_process_device_apertures_ptr; + uint64_t kfd_process_device_apertures_ptr; /* to KFD - indicates amount of memory present in kfd_process_device_apertures_ptr * from KFD - Number of entries filled by KFD. 
*/ - __u32 num_of_nodes; - __u32 pad; + uint32_t num_of_nodes; + uint32_t pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -173,25 +173,25 @@ struct kfd_ioctl_get_process_apertures_new_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_unregister_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_address_watch_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; #define KFD_INVALID_FD 0xffffffff @@ -228,43 +228,43 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_MEM_ERR_GPU_HANG 3 struct kfd_ioctl_create_event_args { - __u64 event_page_offset; /* from KFD */ - __u32 event_trigger_data; /* from KFD - signal events only */ - __u32 event_type; /* to KFD */ - __u32 auto_reset; /* to KFD */ - __u32 node_id; /* to KFD - only valid for certain event types */ - __u32 event_id; /* from KFD */ - __u32 event_slot_index; /* from KFD */ + uint64_t event_page_offset; /* from KFD */ + uint32_t event_trigger_data; /* from KFD - signal events only */ + uint32_t event_type; /* to KFD */ + uint32_t auto_reset; /* to KFD */ + uint32_t node_id; /* to KFD - only valid for certain event types */ + uint32_t event_id; /* from KFD */ + uint32_t event_slot_index; /* 
from KFD */ }; struct kfd_ioctl_destroy_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_set_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_reset_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_memory_exception_failure { - __u32 NotPresent; /* Page not present or supervisor privilege */ - __u32 ReadOnly; /* Write access to a read-only page */ - __u32 NoExecute; /* Execute access to a page marked NX */ - __u32 imprecise; /* Can't determine the exact fault address */ + uint32_t NotPresent; /* Page not present or supervisor privilege */ + uint32_t ReadOnly; /* Write access to a read-only page */ + uint32_t NoExecute; /* Execute access to a page marked NX */ + uint32_t imprecise; /* Can't determine the exact fault address */ }; /* memory exception data */ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; - __u64 va; - __u32 gpu_id; - __u32 ErrorType; /* 0 = no RAS error, + uint64_t va; + uint32_t gpu_id; + uint32_t ErrorType; /* 0 = no RAS error, * 1 = ECC_SRAM, * 2 = Link_SYNFLOOD (poison), * 3 = GPU hang (not attributable to a specific cause), @@ -274,10 +274,10 @@ struct kfd_hsa_memory_exception_data { /* hw exception data */ struct kfd_hsa_hw_exception_data { - __u32 reset_type; - __u32 reset_cause; - __u32 memory_lost; - __u32 gpu_id; + uint32_t reset_type; + uint32_t reset_cause; + uint32_t memory_lost; + uint32_t gpu_id; }; /* Event data */ @@ -286,57 +286,57 @@ struct kfd_event_data { struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data; }; /* From KFD */ - __u64 kfd_event_data_ext; /* pointer to an extension structure for future exception types */ - __u32 event_id; /* to KFD */ - __u32 pad; + uint64_t kfd_event_data_ext; /* pointer to an 
extension structure for future exception types */ + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_wait_events_args { - __u64 events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - __u32 num_events; /* to KFD */ - __u32 wait_for_all; /* to KFD */ - __u32 timeout; /* to KFD */ - __u32 wait_result; /* from KFD */ + uint64_t events_ptr; /* pointed to struct kfd_event_data array, to KFD */ + uint32_t num_events; /* to KFD */ + uint32_t wait_for_all; /* to KFD */ + uint32_t timeout; /* to KFD */ + uint32_t wait_result; /* from KFD */ }; struct kfd_ioctl_set_scratch_backing_va_args { - __u64 va_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint64_t va_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_get_tile_config_args { /* to KFD: pointer to tile array */ - __u64 tile_config_ptr; + uint64_t tile_config_ptr; /* to KFD: pointer to macro tile array */ - __u64 macro_tile_config_ptr; + uint64_t macro_tile_config_ptr; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - __u32 num_tile_configs; + uint32_t num_tile_configs; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - __u32 num_macro_tile_configs; + uint32_t num_macro_tile_configs; - __u32 gpu_id; /* to KFD */ - __u32 gb_addr_config; /* from KFD */ - __u32 num_banks; /* from KFD */ - __u32 num_ranks; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t gb_addr_config; /* from KFD */ + uint32_t num_banks; /* from KFD */ + uint32_t num_ranks; /* from KFD */ /* struct size can be extended later if needed without breaking ABI compatibility */ }; struct kfd_ioctl_set_trap_handler_args { - __u64 tba_addr; /* to KFD */ - __u64 tma_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint64_t tba_addr; /* to KFD */ + uint64_t tma_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_acquire_vm_args { - __u32 drm_fd; /* 
to KFD */ - __u32 gpu_id; /* to KFD */ + uint32_t drm_fd; /* to KFD */ + uint32_t gpu_id; /* to KFD */ }; /* Allocation flags: memory types */ @@ -367,12 +367,12 @@ struct kfd_ioctl_acquire_vm_args { * @flags: memory type and attributes. See KFD_IOC_ALLOC_MEM_FLAGS above */ struct kfd_ioctl_alloc_memory_of_gpu_args { - __u64 va_addr; /* to KFD */ - __u64 size; /* to KFD */ - __u64 handle; /* from KFD */ - __u64 mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ - __u32 gpu_id; /* to KFD */ - __u32 flags; + uint64_t va_addr; /* to KFD */ + uint64_t size; /* to KFD */ + uint64_t handle; /* from KFD */ + uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ + uint32_t gpu_id; /* to KFD */ + uint32_t flags; }; /* Free memory allocated with kfd_ioctl_alloc_memory_of_gpu @@ -380,13 +380,13 @@ struct kfd_ioctl_alloc_memory_of_gpu_args { * @handle: memory handle returned by alloc */ struct kfd_ioctl_free_memory_of_gpu_args { - __u64 handle; /* to KFD */ + uint64_t handle; /* to KFD */ }; /* Map memory to one or more GPUs * * @handle: memory handle returned by alloc - * @device_ids_array_ptr: array of gpu_ids (__u32 per device) + * @device_ids_array_ptr: array of gpu_ids (uint32_t per device) * @n_devices: number of devices in the array * @n_success: number of devices mapped successfully * @@ -399,10 +399,10 @@ struct kfd_ioctl_free_memory_of_gpu_args { * n_devices. 
*/ struct kfd_ioctl_map_memory_to_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t n_devices; /* to KFD */ + uint32_t n_success; /* to/from KFD */ }; /* Unmap memory from one or more GPUs @@ -410,10 +410,10 @@ struct kfd_ioctl_map_memory_to_gpu_args { * same arguments as for mapping */ struct kfd_ioctl_unmap_memory_from_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t n_devices; /* to KFD */ + uint32_t n_success; /* to/from KFD */ }; /* Allocate GWS for specific queue @@ -424,28 +424,28 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { * only support contiguous GWS allocation */ struct kfd_ioctl_alloc_queue_gws_args { - __u32 queue_id; /* to KFD */ - __u32 num_gws; /* to KFD */ - __u32 first_gws; /* from KFD */ - __u32 pad; + uint32_t queue_id; /* to KFD */ + uint32_t num_gws; /* to KFD */ + uint32_t first_gws; /* from KFD */ + uint32_t pad; }; struct kfd_ioctl_get_dmabuf_info_args { - __u64 size; /* from KFD */ - __u64 metadata_ptr; /* to KFD */ - __u32 metadata_size; /* to KFD (space allocated by user) + uint64_t size; /* from KFD */ + uint64_t metadata_ptr; /* to KFD */ + uint32_t metadata_size; /* to KFD (space allocated by user) * from KFD (actual metadata size) */ - __u32 gpu_id; /* from KFD */ - __u32 flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ - __u32 dmabuf_fd; /* to KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ + uint32_t dmabuf_fd; /* to KFD */ }; struct kfd_ioctl_import_dmabuf_args { - __u64 va_addr; /* to KFD */ - __u64 handle; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 dmabuf_fd; /* to KFD */ + uint64_t 
va_addr; /* to KFD */ + uint64_t handle; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t dmabuf_fd; /* to KFD */ }; /* @@ -463,8 +463,8 @@ enum kfd_smi_event { #define KFD_SMI_EVENT_MSG_SIZE 96 struct kfd_ioctl_smi_events_args { - __u32 gpuid; /* to KFD */ - __u32 anon_fd; /* from KFD */ + uint32_t gpuid; /* to KFD */ + uint32_t anon_fd; /* from KFD */ }; /************************************************************************************************** @@ -510,33 +510,33 @@ enum kfd_criu_op { * Return: 0 on success, -errno on failure */ struct kfd_ioctl_criu_args { - __u64 devices; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 bos; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 priv_data; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 pid; /* Used during ops: PROCESS_INFO, RESUME */ - __u32 op; + uint64_t devices; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t bos; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t priv_data; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t pid; /* Used during ops: PROCESS_INFO, RESUME */ + uint32_t op; }; struct kfd_criu_device_bucket { - __u32 user_gpu_id; - __u32 actual_gpu_id; - __u32 drm_fd; - __u32 pad; + uint32_t user_gpu_id; + uint32_t actual_gpu_id; + uint32_t drm_fd; + uint32_t pad; }; struct kfd_criu_bo_bucket { - __u64 addr; - __u64 size; - __u64 offset; - __u64 restored_offset; /* During restore, updated offset for BO */ - __u32 gpu_id; /* This is the 
user_gpu_id */ - __u32 alloc_flags; - __u32 dmabuf_fd; - __u32 pad; + uint64_t addr; + uint64_t size; + uint64_t offset; + uint64_t restored_offset; /* During restore, updated offset for BO */ + uint32_t gpu_id; /* This is the user_gpu_id */ + uint32_t alloc_flags; + uint32_t dmabuf_fd; + uint32_t pad; }; /* CRIU IOCTLs - END */ @@ -616,8 +616,8 @@ enum kfd_ioctl_svm_attr_type { * @value: attribute value */ struct kfd_ioctl_svm_attribute { - __u32 type; - __u32 value; + uint32_t type; + uint32_t value; }; /** @@ -659,10 +659,10 @@ struct kfd_ioctl_svm_attribute { * attribute type to indicate the access for the specified GPU. */ struct kfd_ioctl_svm_args { - __u64 start_addr; - __u64 size; - __u32 op; - __u32 nattr; + uint64_t start_addr; + uint64_t size; + uint32_t op; + uint32_t nattr; /* Variable length array of attributes */ struct kfd_ioctl_svm_attribute attrs[0]; }; diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile new file mode 100644 index 0000000000..cc3d98ac9d --- /dev/null +++ b/plugins/cuda/Makefile @@ -0,0 +1,40 @@ +PLUGIN_NAME := cuda_plugin +PLUGIN_SOBJ := cuda_plugin.so + +DEPS_CUDA := $(PLUGIN_SOBJ) + +PLUGIN_INCLUDE := -iquote../../include +PLUGIN_INCLUDE += -iquote../../criu/include +PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ +PLUGIN_INCLUDE += -iquote../../ + +COMPEL := ../../compel/compel-host + +PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC + +__nmk_dir ?= ../../scripts/nmk/scripts/ +include $(__nmk_dir)msg.mk + +all: $(DEPS_CUDA) + +cuda_plugin.so: cuda_plugin.c + $(call msg-gen, $@) + $(Q) $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) + +clean: + $(call msg-clean, $@) + $(Q) $(RM) $(PLUGIN_SOBJ) +.PHONY: clean + +mrproper: clean + +install: + $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) + $(E) " INSTALL " $(PLUGIN_NAME) + $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) +.PHONY: install + +uninstall: + $(E) " UNINSTALL" $(PLUGIN_NAME) + $(Q) $(RM) 
$(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) +.PHONY: uninstall diff --git a/plugins/cuda/README.md b/plugins/cuda/README.md new file mode 100644 index 0000000000..7b91f69989 --- /dev/null +++ b/plugins/cuda/README.md @@ -0,0 +1,59 @@ +Checkpoint and Restore for CUDA applications with CRIU +====================================================== + +# Requirements +The cuda-checkpoint utility should be placed somewhere in your $PATH and an r555 +or higher GPU driver is required for CUDA CRIU integration support. + +## cuda-checkpoint +The cuda-checkpoint utility can be found at: +https://github.com/NVIDIA/cuda-checkpoint + +cuda-checkpoint is a binary utility used to issue checkpointing commands to CUDA +applications. Updating the cuda-checkpoint utility between driver releases +should not be necessary as the utility simply exposes some extra driver behavior +so driver updates are all that's needed to get access to newer features. + +# Checkpointing Procedure +cuda-checkpoint exposes 4 actions used in the checkpointing process: lock, +checkpoint, restore, unlock. + +* lock - Used with the PAUSE_DEVICES hook while a process is still running to + quiesce the application into a state where it can be checkpointed +* checkpoint - Used with the CHECKPOINT_DEVICES hook once a process has been + seized/frozen to perform the actual checkpointing operation +* restore/unlock - Used with the RESUME_DEVICES_LATE hook to restore the CUDA + state and release the process back to its running state + +These actions are facilitated by a CUDA checkpoint+restore thread that the CUDA +plugin will re-wake when needed. + +# Known Limitations +* Currently GPU memory contents are brought into main system memory and CRIU + then checkpoints that as part of the normal procedure. On systems with many + GPUs with high GPU memory usage this can cause memory thrashing.
A future + CUDA release will add support for dumping the memory contents to files to + alleviate this as well as support in the CRIU plugin. +* There's currently a small race between when a PAUSE_DEVICES hook is called on + a running process and a process calls cuInit() and finishes initializing CUDA + after the PAUSE is issued but before the process is frozen to checkpoint. This + will cause cuda-checkpoint to report that the process is in an illegal state + for checkpointing. It's recommended to just attempt the CRIU procedure + again, as this should be very rare. +* Applications that use NVML will leave some leftover device references as NVML + is not currently supported for checkpointing. There will be support for this + in later drivers. A possible temporary workaround is to have the + {DUMP,RESTORE}_EXT_FILE hook just ignore /dev/nvidiactl and /dev/nvidia{0..N} + remaining references for these applications as in most cases NVML is used to + get info such as GPU count and some capabilities and these values are never + accessed again and unlikely to change. +* CUDA applications that fork() but don't call exec() but also don't issue any + CUDA API calls will have some leftover references to /dev/nvidia* and fail to + checkpoint as a result. This can be worked around in a similar fashion to the + NVML case where the leftover references can be ignored as CUDA is not fork() + safe anyway. +* Restore currently requires that you restore on a system with similar GPUs and + the same GPU count. +* NVIDIA UVM Managed Memory, MIG (Multi Instance GPU), and MPS (Multi-Process + Service) are currently not supported for checkpointing. Future CUDA releases + will add support for these.
diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c new file mode 100644 index 0000000000..c4fc67fa9f --- /dev/null +++ b/plugins/cuda/cuda_plugin.c @@ -0,0 +1,538 @@ +#include "criu-log.h" +#include "plugin.h" +#include "util.h" +#include "cr_options.h" +#include "pid.h" +#include "proc_parse.h" +#include "seize.h" +#include "fault-injection.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +/* cuda-checkpoint binary should live in your PATH */ +#define CUDA_CHECKPOINT "cuda-checkpoint" + +/* cuda-checkpoint --action flags */ +#define ACTION_LOCK "lock" +#define ACTION_CHECKPOINT "checkpoint" +#define ACTION_RESTORE "restore" +#define ACTION_UNLOCK "unlock" + +#define CUDA_CKPT_BUF_SIZE (128) + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "cuda_plugin: " + +/* Disable plugin functionality if cuda-checkpoint is not in $PATH or driver + * version doesn't support --action flag + */ +bool plugin_disabled = false; + +bool plugin_added_to_inventory = false; + +struct pid_info { + int pid; + char checkpointed; + struct list_head list; +}; + +/* Used to track which PID's we've paused CUDA operations on so far so we can + * release them after we're done with the DUMP + */ +static LIST_HEAD(cuda_pids); + +static void dealloc_pid_buffer(struct list_head *pid_buf) +{ + struct pid_info *info; + struct pid_info *n; + + list_for_each_entry_safe(info, n, pid_buf, list) { + list_del(&info->list); + xfree(info); + } +} + +static int add_pid_to_buf(struct list_head *pid_buf, int pid) +{ + struct pid_info *new = xmalloc(sizeof(*new)); + + if (new == NULL) { + return -1; + } + + new->pid = pid; + new->checkpointed = 0; + list_add_tail(&new->list, pid_buf); + + return 0; +} + +static int update_checkpointed_pid(struct list_head *pid_buf, int pid) +{ + struct pid_info *info; + + list_for_each_entry(info, pid_buf, list) { + if (info->pid == pid) { + info->checkpointed = 1; + return 0; + } + } + + return -1; +} + 
+static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) +{ +#define READ 0 +#define WRITE 1 + int fd[2], buf_off; + + if (pipe(fd) != 0) { + pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); + return -1; + } + + buf[0] = '\0'; + + int child_pid = fork(); + if (child_pid == -1) { + pr_err("Failed to fork to exec cuda-checkpoint\n"); + close(fd[READ]); + close(fd[WRITE]); + return -1; + } + + if (child_pid == 0) { // child + if (dup2(fd[WRITE], STDOUT_FILENO) == -1) { + pr_perror("unable to clone fd %d->%d", fd[WRITE], STDOUT_FILENO); + _exit(EXIT_FAILURE); + } + if (dup2(fd[WRITE], STDERR_FILENO) == -1) { + pr_perror("unable to clone fd %d->%d", fd[WRITE], STDERR_FILENO); + _exit(EXIT_FAILURE); + } + close(fd[READ]); + + close_fds(STDERR_FILENO + 1); + + execvp(args[0], (char **)args); + + /* We can't use pr_error() as log file fd is closed. */ + fprintf(stderr, "execvp(\"%s\") failed: %s\n", args[0], strerror(errno)); + + _exit(EXIT_FAILURE); + } + + close(fd[WRITE]); + buf_off = 0; + /* Reserve one byte for the null charracter. 
*/ + buf_size--; + while (buf_off < buf_size) { + int bytes_read; + bytes_read = read(fd[READ], buf + buf_off, buf_size - buf_off); + if (bytes_read == -1) { + pr_perror("Unable to read output of cuda-checkpoint"); + goto err; + } + if (bytes_read == 0) + break; + buf_off += bytes_read; + } + buf[buf_off] = '\0'; + + /* Clear out any of the remaining output in the pipe in case the buffer wasn't large enough */ + while (true) { + char scratch[1024]; + int bytes_read; + bytes_read = read(fd[READ], scratch, sizeof(scratch)); + if (bytes_read == -1) { + pr_perror("Unable to read output of cuda-checkpoint"); + goto err; + } + if (bytes_read == 0) + break; + } + close(fd[READ]); + + int status, exit_code = -1; + if (waitpid(child_pid, &status, 0) == -1) { + pr_perror("Unable to wait for the cuda-checkpoint process %d", child_pid); + goto err; + } + if (WIFSIGNALED(status)) { + int sig = WTERMSIG(status); + + pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); + } else if (WIFEXITED(status)) { + exit_code = WEXITSTATUS(status); + } else { + pr_err("cuda-checkpoint exited improperly: %u\n", status); + } + + if (exit_code != EXIT_SUCCESS) + pr_debug("cuda-checkpoint output ===>\n%s\n" + "<=== cuda-checkpoint output\n", + buf); + + return exit_code; +err: + kill(child_pid, SIGKILL); + waitpid(child_pid, NULL, 0); + return -1; +} + +/** + * Checks if a given flag is supported by the cuda-checkpoint utility + * + * Returns: + * 1 if the flag is supported, + * 0 if the flag is not supported, + * -1 if there was an error launching the cuda-checkpoint utility. 
+ */ +static int cuda_checkpoint_supports_flag(const char *flag) +{ + char msg_buf[2048]; + const char *args[] = { CUDA_CHECKPOINT, "-h", NULL }; + + if (launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)) != 0) + return -1; + + if (strstr(msg_buf, flag) == NULL) + return 0; + + return 1; +} + +/* Retrieve the cuda restore thread TID from the root pid */ +static int get_cuda_restore_tid(int root_pid) +{ + char pid_buf[16]; + char pid_out[CUDA_CKPT_BUF_SIZE]; + + snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid); + + const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL }; + int ret = launch_cuda_checkpoint(args, pid_out, sizeof(pid_out)); + if (ret != 0) { + pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out); + return -1; + } + + return atoi(pid_out); +} + +static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf, + int buf_size) +{ + char pid_buf[16]; + char timeout_buf[16]; + + snprintf(pid_buf, sizeof(pid_buf), "%d", pid); + + const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */, + NULL /* timeout_val */, NULL }; + if (timeout > 0) { + snprintf(timeout_buf, sizeof(timeout_buf), "%d", timeout); + args[5] = "--timeout"; + args[6] = timeout_buf; + } + + return launch_cuda_checkpoint(args, msg_buf, buf_size); +} + +static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset) +{ + /* Since we resumed a thread that CRIU previously already froze we need to + * INTERRUPT it once again, task was already SEIZE'd so we don't need to do + * a compel_interrupt_task() + */ + if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { + pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n", + restore_tid); + return -1; + } + + struct proc_status_creds creds; + if (compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &creds.s, NULL) != 
COMPEL_TASK_ALIVE) { + pr_err("compel_wait_task failed after interrupt\n"); + return -1; + } + + if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { + pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid); + return -1; + } + + if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { + pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid); + return -1; + } + + return 0; +} + +static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) +{ + k_rtsigset_t block; + + if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { + pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid); + return -1; + } + + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + + if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { + pr_err("Failed to block signals on restore tid %d\n", restore_tid); + return -1; + } + + // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread + if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) { + pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid); + return -1; + } + + if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { + pr_err("Could not resume cuda restore tid %d\n", restore_tid); + return -1; + } + + return 0; +} + +int cuda_plugin_checkpoint_devices(int pid) +{ + int restore_tid; + char msg_buf[CUDA_CKPT_BUF_SIZE]; + int int_ret; + int status; + k_rtsigset_t save_sigset; + + if (plugin_disabled) { + return -ENOTSUP; + } + + restore_tid = get_cuda_restore_tid(pid); + + /* We can possibly hit a race with cuInit() where we are past the point of + * locking the process but at lock time cuInit() hadn't completed in which + * case cuda-checkpoint will report that we're in an invalid state to + * checkpoint + */ + if (restore_tid == -1) { + pr_info("No need to checkpoint devices on pid %d\n", pid); + return 0; + } + 
+ pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid); + /* We need to resume the checkpoint thread to prepare the mappings for + * checkpointing + */ + if (resume_restore_thread(restore_tid, &save_sigset)) { + return -1; + } + status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); + goto interrupt; + } + status = update_checkpointed_pid(&cuda_pids, pid); + if (status) { + pr_err("Failed to track checkpointed pid %d\n", pid); + status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); + } + } + + if (!status && !plugin_added_to_inventory) { + status = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (status) + pr_err("Failed to add CUDA plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + +interrupt: + int_ret = interrupt_restore_thread(restore_tid, &save_sigset); + + return status != 0 ? 
status : int_ret; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); + +int cuda_plugin_pause_devices(int pid) +{ + int restore_tid; + char msg_buf[CUDA_CKPT_BUF_SIZE]; + + if (plugin_disabled) { + return -ENOTSUP; + } + + restore_tid = get_cuda_restore_tid(pid); + + if (restore_tid == -1) { + pr_info("no need to pause devices on pid %d\n", pid); + return 0; + } + + pr_info("pausing devices on pid %d\n", pid); + int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("PAUSE_DEVICES failed with %s\n", msg_buf); + if (alarm_timeouted()) + goto unlock; + return -1; + } + + if (add_pid_to_buf(&cuda_pids, pid)) { + pr_err("unable to track paused pid %d\n", pid); + goto unlock; + } + + return 0; +unlock: + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid); + } + return -1; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) + +int resume_device(int pid, int checkpointed) +{ + char msg_buf[CUDA_CKPT_BUF_SIZE]; + int status; + int ret = 0; + int int_ret; + k_rtsigset_t save_sigset; + + int restore_tid = get_cuda_restore_tid(pid); + if (restore_tid == -1) { + pr_info("No need to resume devices on pid %d\n", pid); + return 0; + } + + pr_info("resuming devices on pid %d\n", pid); + /* The resuming process has to stay frozen during this time otherwise + * attempting to access a UVM pointer will crash if we haven't restored the + * underlying mappings yet + */ + pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid); + /* wakeup the restore thread so we can handle the restore for this pid, + * rseq_cs has to be restored before execution + */ + if (resume_restore_thread(restore_tid, &save_sigset)) { + return -1; + } + + if (checkpointed) { + status = 
cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf); + ret = -1; + goto interrupt; + } + } + + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); + ret = -1; + } + +interrupt: + int_ret = interrupt_restore_thread(restore_tid, &save_sigset); + + return ret != 0 ? ret : int_ret; +} + +int cuda_plugin_resume_devices_late(int pid) +{ + if (plugin_disabled) { + return -ENOTSUP; + } + + return resume_device(pid, 1); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) + +int cuda_plugin_init(int stage) +{ + int ret; + + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { + pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n"); + plugin_disabled = true; + return 0; + } + + ret = cuda_checkpoint_supports_flag("--action"); + if (ret == -1) { + pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT); + plugin_disabled = true; + return 0; + } + + if (ret == 0) { + pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. 
Disabling CUDA plugin\n"); + plugin_disabled = true; + return 0; + } + + pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage); + + /* In the DUMP stage track all the PID's we've paused CUDA operations on to + * release them when we're done if the user requested the leave-running option + */ + if (stage == CR_PLUGIN_STAGE__DUMP) { + INIT_LIST_HEAD(&cuda_pids); + } + + dont_use_freeze_cgroup(); + + return 0; +} + +void cuda_plugin_fini(int stage, int ret) +{ + if (plugin_disabled) { + return; + } + + pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret); + + /* Release all the paused PID's at the end of the DUMP stage in case the + * user provides the -R (leave-running) flag or an error occurred + */ + if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) { + struct pid_info *info; + list_for_each_entry(info, &cuda_pids, list) { + resume_device(info->pid, info->checkpointed); + } + } + if (stage == CR_PLUGIN_STAGE__DUMP) { + dealloc_pid_buffer(&cuda_pids); + } +} +CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini) diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index af1858ab58..329d7791de 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -23,6 +23,7 @@ RUN apk update && apk add \ python3 \ sudo \ libcap-utils \ + libdrm-dev \ util-linux COPY . 
/criu @@ -32,6 +33,7 @@ RUN make mrproper && date && make -j $(nproc) CC="$CC" && date RUN apk add \ ip6tables \ iptables \ + iptables-legacy \ nftables \ iproute2 \ tar \ @@ -39,13 +41,12 @@ RUN apk add \ go \ e2fsprogs \ py-yaml \ - py3-flake8 \ py3-importlib-metadata \ asciidoctor # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml +RUN pip3 install junit_xml --break-system-packages RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.amd-rocm b/scripts/build/Dockerfile.amd-rocm index c0d181b039..c466a73d2d 100644 --- a/scripts/build/Dockerfile.amd-rocm +++ b/scripts/build/Dockerfile.amd-rocm @@ -55,7 +55,6 @@ RUN apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-insta protobuf-compiler \ python-protobuf \ python3-minimal \ - python3-future \ python-ipaddress \ curl \ wget \ diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index f2bce1e5ba..4056514891 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -31,10 +31,10 @@ RUN pacman -Syu --noconfirm \ bash \ go \ python-yaml \ - flake8 \ asciidoctor \ python-junit-xml \ python-importlib-metadata \ + libdrm \ diffutils COPY . 
/criu diff --git a/scripts/build/Dockerfile.centos7 b/scripts/build/Dockerfile.centos7 deleted file mode 100644 index 21e70ff0eb..0000000000 --- a/scripts/build/Dockerfile.centos7 +++ /dev/null @@ -1,45 +0,0 @@ -FROM centos:7 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm -RUN yum install -y \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - protobuf-python \ - python \ - python-flake8 \ - python-ipaddress \ - python2-future \ - python2-junit_xml \ - python-yaml \ - python-six \ - sudo \ - tar \ - which \ - e2fsprogs \ - python2-pip \ - rubygem-asciidoctor - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index 488f95d650..a672123441 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -26,9 +26,7 @@ RUN yum install -y --allowerasing \ protobuf-c-devel \ protobuf-devel \ python3-devel \ - python3-flake8 \ python3-PyYAML \ - python3-future \ python3-protobuf \ python3-pip \ sudo \ diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 350102818b..0318f650f3 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -6,7 +6,6 @@ COPY scripts/ci/apt-install /bin/apt-install RUN apt-install protobuf-c-compiler \ libprotobuf-c-dev \ libaio-dev \ - python3-future \ libprotobuf-dev \ protobuf-compiler \ libcap-dev \ @@ -31,4 +30,3 @@ WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - diff --git 
a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index a15038631c..13e9926424 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -21,8 +21,7 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ - python3-minimal \ - python3-future + python3-minimal COPY . /criu WORKDIR /criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 23db14e8df..c2cf20a36b 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -6,7 +6,6 @@ COPY scripts/ci/apt-install /bin/apt-install RUN apt-install protobuf-c-compiler \ libprotobuf-c-dev \ libaio-dev \ - python3-future \ libprotobuf-dev \ protobuf-compiler \ libcap-dev \ diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 6a68cd1ca6..078372c38c 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -21,7 +21,8 @@ RUN apt-install \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} + libnl-route-3-dev:${DEBIAN_ARCH} \ + libdrm-dev:${DEBIAN_ARCH} ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ @@ -39,4 +40,10 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +# amdgpu_plugin with armv7 is not supported +RUN make mrproper && date && \ + make -j $(nproc) && \ + if [ "$SUBARCH" != "armv7" ]; then \ + make -j $(nproc) amdgpu_plugin; \ + fi && \ + make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index e0e72372d9..9b53a76aab 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -27,11 +27,9 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ - python-is-python3 \ python3-minimal \ python3-protobuf \ - python3-yaml \ - python3-future + python3-yaml COPY . /criu WORKDIR /criu diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index 32fc2978a5..566b4c9160 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,4 +1,4 @@ -FROM ubuntu:focal +FROM ubuntu:24.04 COPY scripts/ci/apt-install /bin/apt-install diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 2c006ad873..bc4a59db1c 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,4 +1,4 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide centos7 armv7hf centos8 +ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 30dd9ebeb8..9dc0190b37 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,7 +11,7 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide centos7 centos8 archlinux +TARGETS := alpine fedora-rawhide centos8 archlinux ZDTM_OPTS := UNAME := $(shell uname -m) export UNAME @@ -20,14 +20,6 @@ export 
CONTAINER_RUNTIME alpine: ZDTM_OPTS=-x zdtm/static/binfmt_misc -x zdtm/static/sched_policy00 -define DOCKER_JSON -{ - "storage-driver": "devicemapper" -} -endef - -export DOCKER_JSON - ifeq ($(GITHUB_ACTIONS),true) # GitHub Actions does not give us a real TTY and errors out with # 'the input device is not a TTY' if using '-t' @@ -47,34 +39,20 @@ else endif ifeq ($(CONTAINER_RUNTIME),podman) - # Just as Docker needs to use devicemapper Podman needs vfs - # as graphdriver as overlayfs does not support all test cases - STORAGE_DRIVER := vfs # Podman limits the number of processes in a container using cgroups. # Disable it as it breaks the thread-bomb test CONTAINER_OPTS += --pids-limit=0 endif -export STORAGE_DRIVER - -restart-docker: - if [ "$$UNAME" = "x86_64" ] && [ "$$CONTAINER_RUNTIME" = "docker" ]; then \ - echo "$$DOCKER_JSON" > /etc/docker/daemon.json; \ - cat /etc/docker/daemon.json; \ - systemctl status docker; \ - systemctl restart docker; \ - systemctl status docker; \ - fi - export ZDTM_OPTS -$(TARGETS): restart-docker +$(TARGETS): $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run --env-file docker.env $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh + $(CONTAINER_RUNTIME) run --env-file docker.env -v `pwd`/../../:/criu $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh -fedora-asan: restart-docker +fedora-asan: $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTS) + $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) -v `pwd`/../../:/criu criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTS) docker-test: ./docker-test.sh @@ -82,10 +60,7 @@ docker-test: podman-test: ./podman-test.sh -# overlayfs behaves differently on Ubuntu and breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 -# Switch to devicemapper -java-test: restart-docker +java-test: ./java-test.sh setup-vagrant: @@ -102,5 +77,18 @@ 
vagrant-fedora-non-root: setup-vagrant .PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide vagrant-fedora-non-root +check-commit: + ($(MAKE) -j $$(nproc) -C ../.. && \ + echo "Commit $$(git rev-parse --short HEAD) built successfully") || \ + (echo "Build failed for $$(git rev-list -n 1 --pretty HEAD)" && \ + exit 1) + +.PHONY: check-commit + +loongarch64-qemu-test: + ./loongarch64-qemu-test.sh + +.PHONY: loongarch64-qemu-test + %: $(MAKE) -C ../build $@$(target-suffix) diff --git a/scripts/ci/apt-install b/scripts/ci/apt-install index 45aca13f40..676e0f7949 100755 --- a/scripts/ci/apt-install +++ b/scripts/ci/apt-install @@ -15,7 +15,7 @@ while true; do if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then exit 1 fi - apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends "$@" && break + apt-get update -y && apt-get install -y --no-install-recommends "$@" && break # In case it is a network error let's wait a bit. echo "Retrying attempt ${install_retry_counter}" diff --git a/scripts/ci/asan.sh b/scripts/ci/asan.sh index deeeca0b9d..8b72fa5f1a 100755 --- a/scripts/ci/asan.sh +++ b/scripts/ci/asan.sh @@ -4,6 +4,9 @@ set -x cat /proc/self/mountinfo +time make ASAN=1 -j 4 V=1 +time make -j4 -C test/zdtm V=1 + chmod 0777 test chmod 0777 test/zdtm/transition/ chmod 0777 test/zdtm/static diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index beb7da6da6..aaf443afdc 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,24 +2,6 @@ set -x -e -o pipefail -./apt-install \ - apt-transport-https \ - ca-certificates \ - curl \ - software-properties-common - -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - - -add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable test" - -./apt-install docker-ce - -# shellcheck source=/dev/null -. 
/etc/lsb-release - # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json service docker restart @@ -28,6 +10,9 @@ CRIU_LOG='/criu.log' mkdir -p /etc/criu echo "log-file=$CRIU_LOG" > /etc/criu/runc.conf +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf + export SKIP_CI_TEST=1 ./run-ci-tests.sh @@ -84,27 +69,25 @@ print_logs () { } declare -i max_restore_container_tries=3 -current_iteration= restore_container () { CHECKPOINT_NAME=$1 - docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { + for i in $(seq $max_restore_container_tries); do + docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log && break + # FIXME: There is a race condition in docker/containerd that causes # docker to occasionally fail when starting a container from a # checkpoint immediately after the checkpoint has been created. # https://github.com/moby/moby/issues/42900 - if [ "$current_iteration" -gt "$max_restore_container_tries" ]; then + if grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log; then + echo "Retry container restore: $i/$max_restore_container_tries" + sleep 1; + else print_logs fi - grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log && { - ((current_iteration+=1)) - echo "Retry container restore: $current_iteration" - sleep 1; - restore_container "$CHECKPOINT_NAME" - } || - print_logs - } && current_iteration=0 + + done } # Scenario: Create multiple containers and checkpoint and restore them once diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh new file mode 100755 index 0000000000..d5646468e8 --- /dev/null +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -x + 
+./apt-install \ + apt-transport-https \ + ca-certificates \ + curl \ + software-properties-common \ + sshpass \ + openssh-client + +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - + +add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable test" + +./apt-install docker-ce + +# shellcheck source=/dev/null +. /etc/lsb-release + +# docker checkpoint and restore is an experimental feature +echo '{ "experimental": true }' > /etc/docker/daemon.json +service docker restart + +docker info + +# run a loongarch64 vm + +PORT='2222' +USER='root' +PASSWORD='loongarch64' +NAME='vm' + +docker run \ + -d \ + --net host \ + --name $NAME \ + merore/archlinux-loongarch64 + +run() { + if [ -z "$1" ]; then + echo "Command cannot be empty." + exit 1 + fi + sshpass -p $PASSWORD ssh -o StrictHostKeyChecking=no -p $PORT $USER@127.0.0.1 "$1" +} + +# wait vm to start +while (! run "uname -a") +do + echo "Wait vm to start..." + sleep 1 +done +echo "The loongarch64 vm is started!" 
+ +# Tar criu and send to vm +tar -cf criu.tar ../../../criu +sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -P $PORT criu.tar $USER@127.0.0.1:/root + +# build and test +run 'cd /root; tar -xf criu.tar' +run 'cd /root/criu; make -j4 && make -j4 -C test/zdtm' +run "cd /root/criu; ./test/zdtm.py run -t zdtm/static/maps02 -t zdtm/static/maps05 -t zdtm/static/maps06 -t zdtm/static/maps10 -t zdtm/static/maps_file_prot -t zdtm/static/memfd00 -t zdtm/transition/fork -t zdtm/transition/fork2 -t zdtm/transition/shmem -f h" diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 687acb8ff5..185783011d 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -17,10 +17,17 @@ mkdir -p /etc/criu echo "manage-cgroups ignore" > /etc/criu/runc.conf sed -i 's/#runtime\s*=\s*.*/runtime = "runc"/' /usr/share/containers/containers.conf +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf + +cat /proc/self/mountinfo podman info podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' +# Show criu logs in case of error +trap 'cat /var/lib/containers/storage/overlay-containers/*/userdata/*.log' EXIT + sleep 1 for i in $(seq 20); do echo "Test $i for podman container checkpoint" @@ -61,3 +68,5 @@ for i in $(seq 20); do podman ps -a rm -f /tmp/chkpt.tar.gz done + +trap 'echo PASS' EXIT \ No newline at end of file diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 7c62aaaa2c..09085c403b 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -18,13 +18,12 @@ dnf install -y \ libnet-devel \ libnl3-devel \ libbsd-devel \ + libselinux-utils \ make \ procps-ng \ protobuf-c-devel \ protobuf-devel \ - python3-flake8 \ python3-PyYAML \ - python3-future \ python3-protobuf \ python3-junit_xml \ python3-pip \ @@ -36,6 +35,7 @@ dnf 
install -y \ which \ e2fsprogs \ rubygem-asciidoctor \ + libdrm-devel \ kmod # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 229de97c1c..b472e954c2 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -4,9 +4,9 @@ set -x -e CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time flake8 libbsd-dev python3-yaml - libperl-dev pkg-config python3-future python3-protobuf - python3-pip python3-importlib-metadata python3-junit.xml) + libnl-route-3-dev time libbsd-dev python3-yaml + libperl-dev pkg-config python3-protobuf python3-pip + python3-importlib-metadata python3-junit.xml libdrm-dev) X86_64_PKGS=(gcc-multilib) @@ -58,10 +58,6 @@ ci_prep () { scripts/ci/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" - - # zdtm uses an unversioned python binary to run the tests. - # let's point python to python3 - ln -sf /usr/bin/python3 /usr/bin/python } test_stream() { @@ -260,11 +256,15 @@ if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi +make -C test/others/criu-ns/ run make -C test/others/skip-file-rwx-check/ run make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/static/env00 --sibling +./test/zdtm.py run -t zdtm/static/maps00 --preload-libfault +./test/zdtm.py run -t zdtm/static/maps02 --preload-libfault + ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server @@ -291,14 +291,38 @@ ip net add test # Rootless tests # Check if cap_checkpoint_restore is supported and also if unshare -c is supported. 
-if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true; then +# +# Do not run this test in a container (see https://github.com/checkpoint-restore/criu/issues/2312). +# Before v6.8-rc1~215^2~6, the kernel currently did not show correct device and +# inode numbers in /proc/pid/maps for stackable file systems. +skip=0 +findmnt -no FSTYPE / | grep overlay && { + ./criu/criu check --feature overlayfs_maps || skip=1 +} +unshare -c /bin/true || skip=1 +capsh --supports=cap_checkpoint_restore || skip=1 + +if [ "$skip" == 0 ]; then make -C test/zdtm/ cleanout rm -rf test/dump setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then + # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. + selinuxmode=$(getenforce) + if [ "$selinuxmode" != "Disabled" ]; then + setenforce Permissive + fi + + fi # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. 
sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then + if [ "$selinuxmode" != "Disabled" ]; then + setenforce "$selinuxmode" + fi + fi setcap -r criu/criu else echo "Skipping unprivileged mode tests" @@ -319,6 +343,9 @@ make -C test/others/ns_ext run # config file parser and parameter testing make -C test/others/config-file run +# action script testing +make -C test/others/action-script run + # Skip all further tests when running with GCOV=1 # The one test which currently cannot handle GCOV testing is compel/test # Probably because the GCOV Makefile infrastructure does not exist in compel @@ -326,3 +353,15 @@ make -C test/others/config-file run # compel testing make -C compel/test + +# amdgpu and cuda plugin testing +make amdgpu_plugin +make -C plugins/amdgpu/ test_topology_remap +./plugins/amdgpu/test_topology_remap + +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/busyloop00 --criu-plugin inventory_test_enabled inventory_test_disabled + +./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138 diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 5cc8424423..3904c51d22 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,9 +6,9 @@ set -e set -x -VAGRANT_VERSION=2.2.19 -FEDORA_VERSION=37 -FEDORA_BOX_VERSION=37.20221105.0 +VAGRANT_VERSION=2.4.1 +FEDORA_VERSION=40 +FEDORA_BOX_VERSION=40.20240414.0 setup() { if [ -n "$TRAVIS" ]; then @@ -19,7 +19,7 @@ setup() { # Tar up the git checkout to have vagrant rsync it to the VM tar cf criu.tar ../../../criu # Cirrus has problems with the following 
certificate. - wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_"$(uname -m)".deb -O /tmp/vagrant.deb && \ + wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ @@ -38,8 +38,8 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-flake8 python3-future python3-protobuf python3-importlib-metadata \ - python3-junit_xml rubygem-asciidoctor iptables libselinux-devel libbpf-devel + protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ + rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default cat /proc/cmdline @@ -57,6 +57,11 @@ fedora-no-vdso() { } fedora-rawhide() { + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks + # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously + # installed this reboots the VM. + vagrant reload + ssh default uname -a # # Workaround the problem: # error running container: error from /usr/bin/crun creating container for [...]: sd-bus call: Transport endpoint is not connected @@ -65,6 +70,10 @@ fedora-rawhide() { # ssh default 'sudo dnf remove -y crun || true' ssh default sudo dnf install -y podman runc + # Some tests in the container need selinux to be disabled. + # In the container it is not possible to change the state of selinux. + # Let's just disable it for this test run completely. 
+ ssh default 'sudo setenforce Permissive' ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } diff --git a/scripts/criu-ns b/scripts/criu-ns index d51e7772c0..5950d7c50e 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -6,6 +6,7 @@ import sys import os import fcntl import termios +import time # constants for unshare CLONE_NEWNS = 0x00020000 @@ -70,7 +71,19 @@ def _wait_for_process_status(criu_pid): try: (pid, status) = os.wait() if pid == criu_pid: - return os.waitstatus_to_exitcode(status) + # The following code block is based on + # os.waitstatus_to_exitcode() introduced in Python 3.9 + # and we implement this for comparability with older + # versions of Python. + if os.WIFSIGNALED(status): + return os.WTERMSIG(status) + elif os.WIFEXITED(status): + return os.WEXITSTATUS(status) + elif os.WIFSTOPPED(status): + return os.WSTOPSIG(status) + else: + raise Exception("CRIU was terminated by an " + "unidentified reason") except OSError: return -251 @@ -80,8 +93,21 @@ def run_criu(args): Spawn CRIU binary """ print(sys.argv) - os.execlp('criu', *['criu'] + args) - raise OSError(errno.ENOENT, "No such command") + + if "--criu-binary" in args: + try: + opt_index = args.index("--criu-binary") + path = args[opt_index + 1] + del args[opt_index:opt_index + 2] + args.insert(0, "criu") + os.execv(path, args) + raise OSError(errno.ENOENT, "No such command") + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, "--criu-binary missing argument") + else: + args.insert(0, "criu") + os.execvp("criu", args) + raise OSError(errno.ENOENT, "No such command") # pidns_holder creates a process that is reparented to the init. 
@@ -110,8 +136,8 @@ def wrap_restore(): if '--restore-sibling' in restore_args: raise OSError(errno.EINVAL, "--restore-sibling is not supported") - # Unshare pid and mount namespaces - if _unshare(CLONE_NEWNS | CLONE_NEWPID) != 0: + # Unshare pid namespace + if _unshare(CLONE_NEWPID) != 0: _errno = ctypes.get_errno() raise OSError(_errno, errno.errorcode[_errno]) @@ -123,8 +149,32 @@ def wrap_restore(): restore_detached = True restore_args.remove('--restore-detached') + restore_pidfile = None + if '--pidfile' in restore_args: + try: + opt_index = restore_args.index('--pidfile') + restore_pidfile = restore_args[opt_index + 1] + del restore_args[opt_index:opt_index + 2] + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, "--pidfile missing argument") + + if not restore_pidfile.startswith('/'): + for base_dir_opt in ['--work-dir', '-W', '--images-dir', '-D']: + if base_dir_opt in restore_args: + try: + opt_index = restore_args.index(base_dir_opt) + restore_pidfile = os.path.join(restore_args[opt_index + 1], restore_pidfile) + break + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, base_dir_opt + " missing argument") + criu_pid = os.fork() if criu_pid == 0: + # Unshare mount namespace + if _unshare(CLONE_NEWNS) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + os.setsid() # Set stdin tty to be a controlling tty of our new session, this is # required by --shell-job option, as for it CRIU would try to set a @@ -139,6 +189,25 @@ def wrap_restore(): _mount_new_proc() run_criu(restore_args) + if restore_pidfile: + restored_pid = None + retry = 5 + + while not restored_pid and retry: + with open('/proc/%d/task/%d/children' % (criu_pid, criu_pid)) as f: + line = f.readline().strip() + if len(line): + restored_pid = line + break + retry -= 1 + time.sleep(1) + + if restored_pid: + with open(restore_pidfile, 'w+') as f: + f.write(restored_pid) + else: + print("Warn: Search of 
restored pid for --pidfile option timeouted") + if restore_detached: return 0 @@ -147,7 +216,7 @@ def wrap_restore(): def get_varg(args): for i in range(1, len(sys.argv)): - if not sys.argv[i] in args: + if sys.argv[i] not in args: continue if i + 1 >= len(sys.argv): diff --git a/scripts/fetch-clang-format.sh b/scripts/fetch-clang-format.sh index b80175f05b..5b6037d619 100755 --- a/scripts/fetch-clang-format.sh +++ b/scripts/fetch-clang-format.sh @@ -8,7 +8,7 @@ URL="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/.c curl -s "${URL}" | sed -e " s,^\( *\)#\([A-Z]\),\1\2,g; s,ControlStatements,ControlStatementsExceptForEachMacros,g; - s,ColumnLimit: 80,ColumnLimit: 120,g; + s,ColumnLimit: 80,ColumnLimit: 0,g; s,Intended for clang-format >= 4,Intended for clang-format >= 11,g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_bit',g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_pstree_item',g; diff --git a/scripts/github-indent-warnings.py b/scripts/github-indent-warnings.py new file mode 100755 index 0000000000..04f82d6c11 --- /dev/null +++ b/scripts/github-indent-warnings.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 +import sys +import re + +re_file = r'^diff --git a/(\S\S*)\s.*$' +re_line = r'^@@ -(\d\d*)\D.*@@.*$' + +if __name__ == '__main__': + if len(sys.argv) != 1 and len(sys.argv) != 2: + print(f'usage: {sys.argv[0]} ') + print(f'usage: | {sys.argv[0]}') + exit(1) + + input_file = sys.stdin.fileno() + if len(sys.argv) == 2: + input_file = sys.argv[1] + + with open(input_file, 'r') as fi: + file_name = None + line_number = None + for line in fi: + file_matches = re.findall(re_file, line) + if len(file_matches) == 1: + file_name = file_matches[0] + continue + + if file_name is None: + continue + + line_matches = re.findall(re_line, line) + if len(line_matches) == 1: + line_number = int(line_matches[0]) + 3 + print(f'::warning file={file_name},line={line_number}::clang-format: Possible coding style problem 
(https://github.com/checkpoint-restore/criu/blob/criu-dev/CONTRIBUTING.md#automatic-tools-to-fix-coding-style)') diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh index 540c2c0949..8be49c7871 100755 --- a/scripts/install-debian-pkgs.sh +++ b/scripts/install-debian-pkgs.sh @@ -15,7 +15,7 @@ function print_help() function process() { sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' ${REQ_PKGS} )" + sudo apt-get install -yq "$( sed 's/\#.*$//' "${REQ_PKGS}" )" } if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then diff --git a/scripts/magic-gen.py b/scripts/magic-gen.py index 3b1f29fb52..38dff1424a 100755 --- a/scripts/magic-gen.py +++ b/scripts/magic-gen.py @@ -1,4 +1,4 @@ -#!/bin/env python2 +#!/bin/env python3 import sys diff --git a/scripts/nmk/scripts/include.mk b/scripts/nmk/scripts/include.mk index c1c1e94af4..55c5be307f 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -20,7 +20,8 @@ ARCH ?= $(shell echo $(SUBARCH) | sed \ -e s/ppc64.*/ppc64/ \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ - -e s/aarch64.*/aarch64/) + -e s/aarch64.*/aarch64/ \ + -e s/loongarch64.*/loongarch64/) export SUBARCH ARCH diff --git a/scripts/nmk/scripts/main.mk b/scripts/nmk/scripts/main.mk index 493a164f88..7f11bda236 100644 --- a/scripts/nmk/scripts/main.mk +++ b/scripts/nmk/scripts/main.mk @@ -1,7 +1,7 @@ ifndef ____nmk_defined__main # -# Genaral inclusion statement +# General inclusion statement ifndef ____nmk_defined__include include $(__nmk_dir)include.mk diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index 1681d4e909..724204a03c 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell which python3 2>/dev/null || which python2 2>/dev/null) +FULL_PYTHON := $(shell which python3 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x 
"$$BASH" ]; then echo $$BASH; \ diff --git a/scripts/ruff.toml b/scripts/ruff.toml new file mode 100644 index 0000000000..2b0385976e --- /dev/null +++ b/scripts/ruff.toml @@ -0,0 +1,4 @@ +# Ignore `E401` (import violations) in all `__init__.py` files +[lint.per-file-ignores] +"__init__.py" = ["F401"] + diff --git a/scripts/uninstall_module.py b/scripts/uninstall_module.py index 439fca18a1..8a9b70892b 100755 --- a/scripts/uninstall_module.py +++ b/scripts/uninstall_module.py @@ -38,8 +38,9 @@ def uninstall_module(package_name: str, prefix=None): if prefix: add_site_dir(prefix) try: - dist_info_path = str(importlib_metadata.distribution(package_name)._path) - except importlib_metadata.PackageNotFoundError: + distribution = next(importlib_metadata.Distribution.discover(name=package_name)) + dist_info_path = str(distribution._path) + except StopIteration: print(f"Skipping {package_name} as it is not installed.") sys.exit(0) diff --git a/soccr/soccr.c b/soccr/soccr.c index abea937033..8e1ce1c633 100644 --- a/soccr/soccr.c +++ b/soccr/soccr.c @@ -781,7 +781,7 @@ int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsi return 0; } -static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) +static int __send_queue(struct libsoccr_sk *sk, const char *queue, char *buf, __u32 len) { int ret, err = -1, max_chunk; int off; @@ -816,7 +816,7 @@ static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) continue; } - logerr("Can't restore %d queue data (%d), want (%d:%d:%d)", queue, ret, chunk, len, max_chunk); + logerr("Can't restore %s queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); goto err; } off += ret; @@ -837,7 +837,7 @@ static int send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) return -1; } - return __send_queue(sk, queue, buf, len); + return __send_queue(sk, queue == TCP_RECV_QUEUE ? 
"recv" : "send", buf, len); } static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue, @@ -876,7 +876,7 @@ static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_dat * they can be restored without any tricks. */ tcp_repair_off(sk->fd); - if (__send_queue(sk, TCP_SEND_QUEUE, buf + len, ulen)) + if (__send_queue(sk, "not-sent send", buf + len, ulen)) return -3; if (tcp_repair_on(sk->fd)) return -4; diff --git a/soccr/test/Makefile b/soccr/test/Makefile index 4585400454..499901b0c5 100644 --- a/soccr/test/Makefile +++ b/soccr/test/Makefile @@ -21,7 +21,6 @@ tcp-conn-v6: tcp-conn-v6.c test: tcp-constructor tcp-conn tcp-conn-v6 unshare -n sh -c "ip link set up dev lo; ./tcp-conn" unshare -n sh -c "ip link set up dev lo; ./tcp-conn-v6" - python run.py ./$(RUN) + python3 run.py ./$(RUN) .PHONY: test - diff --git a/soccr/test/run.py b/soccr/test/run.py index 1ffe58a582..57c556e361 100644 --- a/soccr/test/run.py +++ b/soccr/test/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys, os import hashlib diff --git a/soccr/test/tcp-test.py b/soccr/test/tcp-test.py index ff3fe29dc2..b48f532eb0 100755 --- a/soccr/test/tcp-test.py +++ b/soccr/test/tcp-test.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 -from __future__ import print_function import sys, socket import hashlib diff --git a/test/Makefile b/test/Makefile index e8fcffe3fc..0bfdab6802 100644 --- a/test/Makefile +++ b/test/Makefile @@ -12,7 +12,7 @@ all: $(MAKE) zdtm-freezer .PHONY: all -TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job skip-file-rwx-check +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job criu-ns skip-file-rwx-check other: for t in $(TESTS); do \ @@ -45,10 +45,6 @@ zdtm-freezer: ./zdtm.py run --test zdtm/transition/thread-bomb --pre 3 
--freezecg zdtm:f .PHONY: zdtm-freezer -fault-injection: - $(MAKE) -C fault-injection -.PHONY: fault-injection - override CFLAGS += -D_GNU_SOURCE clean_root: diff --git a/test/check_actions.py b/test/check_actions.py index 4973e39382..84d738dbb7 100755 --- a/test/check_actions.py +++ b/test/check_actions.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/crit-recode.py b/test/crit-recode.py index 4135681e11..f119271d8b 100755 --- a/test/crit-recode.py +++ b/test/crit-recode.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import pycriu import sys import os diff --git a/test/cuda-checkpoint/.gitignore b/test/cuda-checkpoint/.gitignore new file mode 100644 index 0000000000..717fb70286 --- /dev/null +++ b/test/cuda-checkpoint/.gitignore @@ -0,0 +1 @@ +cuda-checkpoint diff --git a/test/cuda-checkpoint/Makefile b/test/cuda-checkpoint/Makefile new file mode 100644 index 0000000000..c59dadddc7 --- /dev/null +++ b/test/cuda-checkpoint/Makefile @@ -0,0 +1,17 @@ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) + +BIN := cuda-checkpoint +SRC := cuda-checkpoint.c +DEP := $(SRC:%.c=%.d) +OBJ := $(SRC:%.c=%.o) +TARGETS := $(BIN) + +include ../zdtm/Makefile.inc + +all: $(TARGETS) +.PHONY: all + +clean-more: + $(RM) $(TARGETS) +.PHONY: clean-more +clean: clean-more diff --git a/test/cuda-checkpoint/cuda-checkpoint.c b/test/cuda-checkpoint/cuda-checkpoint.c new file mode 100644 index 0000000000..f35a4b41df --- /dev/null +++ b/test/cuda-checkpoint/cuda-checkpoint.c @@ -0,0 +1,53 @@ +/* The mocked version of cuda-checkpoint. 
*/ +#include +#include +#include + +int main(int argc, char *argv[]) +{ + int c; + + while (1) { + int option_index = 0; + static struct option long_options[] = { + { "pid", required_argument, 0, 'p' }, + { "get-restore-tid", no_argument, 0, 'g' }, + { "action", required_argument, 0, 'a' }, + { "timeout", required_argument, 0, 't' }, + { "help", no_argument, 0, 'h' }, + { 0, 0, 0, 0 } + }; + + c = getopt_long(argc, argv, "p:ga:ht:", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'p': + printf("%s\n", optarg); + break; + case 'g': + case 'a': + case 't': + break; + case 'h': + printf("--action - execute an action"); + break; + + default: + fprintf(stderr, "getopt returned character code 0%o ??\n", c); + return 1; + } + } + + if (optind < argc) { + fprintf(stderr, "non-option ARGV-elements: "); + while (optind < argc) + fprintf(stderr, "%s ", argv[optind++]); + fprintf(stderr, "\n"); + return 1; + } + + return 0; +} diff --git a/test/exhaustive/pipe.py b/test/exhaustive/pipe.py index 7f1c53d34b..afe20846a3 100755 --- a/test/exhaustive/pipe.py +++ b/test/exhaustive/pipe.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import argparse import os diff --git a/test/exhaustive/unix.py b/test/exhaustive/unix.py index 5b4c972cb6..689b1fb3ae 100755 --- a/test/exhaustive/unix.py +++ b/test/exhaustive/unix.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os @@ -462,7 +462,7 @@ def set_nonblock(sk): def chk_real_state(st): - # Before enything else -- check that we still have + # Before anything else -- check that we still have # all the sockets at hands for sk in st.sockets: if not sk.visible: diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 252778969d..27e2b7b155 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import ctypes libc = ctypes.CDLL(None) diff --git 
a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 7f503e817a..fc0eddc2b2 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -9,7 +9,7 @@ prep ./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --report report -f h || fail # FIXME: fhandles looks broken on btrfs -grep -P "/.* / " /proc/self/mountinfo | grep -q btrfs || NOBTRFS=$? +findmnt --noheadings --target . | grep -q btrfs || NOBTRFS=$? if [ $NOBTRFS -eq 1 ] ; then ./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --pre 2 -f uns || fail fi @@ -39,3 +39,11 @@ fi ./test/zdtm.py run -t zdtm/static/fpu03 --fault 134 -f h --norst || fail # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail + +# check dont_use_freeze_cgroup +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst + +if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then + fail +fi diff --git a/test/libfault/Makefile b/test/libfault/Makefile new file mode 100644 index 0000000000..cbe47fdf24 --- /dev/null +++ b/test/libfault/Makefile @@ -0,0 +1,21 @@ +CC = gcc +CFLAGS = -c -fPIC -ldl + +SRC = libfault.c +OBJ = $(SRC:.c=.o) + +LIB = libfault.so + +.PHONY: all clean run + +all: $(LIB) + +$(LIB): $(OBJ) + $(CC) -shared -o $(LIB) $(OBJ) + +$(OBJ): $(SRC) + $(CC) $(CFLAGS) $< + +clean: + rm -f $(OBJ) $(LIB) + diff --git a/test/libfault/libfault.c b/test/libfault/libfault.c new file mode 100644 index 0000000000..650bf08ca0 --- /dev/null +++ b/test/libfault/libfault.c @@ -0,0 +1,31 @@ +#define _GNU_SOURCE +#include +#include +#include + +ssize_t (*original_pread)(int fd, void *buf, size_t count, off_t offset) = NULL; + +/** + * This function is a wrapper around pread() that is used for testing CRIU's + * handling of cases where pread() returns less data than requested. 
+ * + * pmc_fill() in criu/pagemap.c is a good example of where this can happen. + */ +ssize_t pread64(int fd, void *buf, size_t count, off_t offset) +{ + if (!original_pread) { + original_pread = dlsym(RTLD_NEXT, "pread"); + if (!original_pread) { + errno = EIO; + return -1; + } + } + + /* The following aims to simulate the case when pread() returns less + * data than requested. We need to ensure that CRIU handles such cases. */ + if (count > 2048) { + count -= 1024; + } + + return original_pread(fd, buf, count, offset); +} diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore new file mode 100644 index 0000000000..c0b6a2490a --- /dev/null +++ b/test/others/action-script/.gitignore @@ -0,0 +1 @@ +img-dir-* diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile new file mode 100644 index 0000000000..f1ce191dbc --- /dev/null +++ b/test/others/action-script/Makefile @@ -0,0 +1,5 @@ +run: + @make -C .. loop + ./run.sh + +.PHONY: run diff --git a/test/others/action-script/action-script.sh b/test/others/action-script/action-script.sh new file mode 100755 index 0000000000..aba8292c05 --- /dev/null +++ b/test/others/action-script/action-script.sh @@ -0,0 +1,2 @@ +#!/bin/bash +touch action-hook-"$CRTOOLS_SCRIPT_ACTION" diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh new file mode 100755 index 0000000000..a82fccf359 --- /dev/null +++ b/test/others/action-script/run.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +set -ebm + +# shellcheck source=test/others/env.sh +source ../env.sh || exit 1 + +SELFDIR="$(dirname "$(readlink -f "$0")")" +SCRIPT="$SELFDIR/action-script.sh" +IMGDIR="$SELFDIR/img-dir-$$" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +trap "cleanup" QUIT TERM INT HUP EXIT + +# shellcheck disable=SC2317 +# https://github.com/koalaman/shellcheck/issues/2660 +function cleanup() +{ + if [[ -n "$PID" ]]; then + kill -9 "$PID" + fi +} + +PID=$(../loop) +if ! 
$CRIU dump -v4 -o dump.log -t "$PID" -D "$IMGDIR" --action-script "$SCRIPT"; then + echo "Failed to checkpoint process $PID" + cat dump.log + kill -9 "$PID" + exit 1 +fi + +if ! $CRIU restore -v4 -o restore.log -D "$IMGDIR" -d --pidfile test.pidfile --action-script "$SCRIPT"; then + echo "CRIU restore failed" + echo FAIL + exit 1 +fi + +PID=$(cat "$IMGDIR"/test.pidfile) + +found_missing_file=false +hooks=("pre-dump" "post-dump" "pre-restore" "pre-resume" "post-restore" "post-resume") + +for hook in "${hooks[@]}" +do + if [ ! -e "$IMGDIR/action-hook-$hook" ]; then + echo "ERROR: action-hook-$hook does not exist" + found_missing_file=true + fi +done + +if [ "$found_missing_file" = true ]; then + exit 1 +fi + +echo PASS + +rm -rf "$IMGDIR" +exit 0 diff --git a/test/others/app-emu/java/HelloWorld/run.sh b/test/others/app-emu/java/HelloWorld/run.sh index 0ed6afd141..e6dcbd9fca 100644 --- a/test/others/app-emu/java/HelloWorld/run.sh +++ b/test/others/app-emu/java/HelloWorld/run.sh @@ -18,7 +18,7 @@ setsid java HelloWorld & pid=${!} -echo Lanuched java application with pid $pid in background +echo Launched java application with pid $pid in background ${criu} dump -D dump -o dump.log -v4 --shell-job -t ${pid} || { echo "Dump failed" diff --git a/test/others/app-emu/make/run.sh b/test/others/app-emu/make/run.sh index 7cb44c7709..d871b7d9c4 100644 --- a/test/others/app-emu/make/run.sh +++ b/test/others/app-emu/make/run.sh @@ -28,7 +28,7 @@ setsid make -j4 & pid=${!} -echo Lanuched make in $pid background +echo Launched make in $pid background sleep 2 ${criu} dump --shell-job -D dump -o dump.log -v4 -t ${pid} || { diff --git a/test/others/bers/bers.c b/test/others/bers/bers.c index 37cf84dd3d..b291e3bcbe 100644 --- a/test/others/bers/bers.c +++ b/test/others/bers/bers.c @@ -391,7 +391,7 @@ int main(int argc, char *argv[]) pr_msg(" -f|--files create files for each task\n"); pr_msg(" -m|--memory allocate megabytes for each task\n"); pr_msg(" --memory-chunks split memory to 
equal parts\n"); - pr_msg(" --mem-fill fill memory with data dependin on :\n"); + pr_msg(" --mem-fill fill memory with data depending on :\n"); pr_msg(" all fill every byte of memory\n"); pr_msg(" light fill first bytes of every page\n"); pr_msg(" dirtify fill every page\n"); diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 105aac72b4..2698bbd3c2 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -101,6 +101,8 @@ function run_test2 { ${CRIT} x ./ rss || exit 1 } +${CRIT} --version + gen_imgs run_test1 run_test2 diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index eec2b817f4..4399044d71 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -43,5 +43,13 @@ function run_test { echo "= done" } +UNAME_M=$(uname -m) + +if [ "$UNAME_M" != "x86_64" ]; then + # the criu-coredump script is only x86_64 aware + echo "criu-coredump only support x86_64. skipping." + exit 0 +fi + gen_imgs run_test diff --git a/test/others/criu-ns/Makefile b/test/others/criu-ns/Makefile new file mode 100644 index 0000000000..4d901a1116 --- /dev/null +++ b/test/others/criu-ns/Makefile @@ -0,0 +1,3 @@ +run: + @make -C ../.. 
zdtm_ct + ../../zdtm_ct run.py diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py new file mode 100755 index 0000000000..0a36438e80 --- /dev/null +++ b/test/others/criu-ns/run.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 + +import fcntl +import os +import pathlib +import pty +import shutil +import subprocess +import sys +import termios +import time + + +CRIU_BIN = "../../../criu/criu" +CRIU_NS = "../../../scripts/criu-ns" +IMG_DIR = "dumpdir" +DUMP_LOG = "dump.log" +RESTORE_LOG = "restore.log" +PIDFILE = "pidfile" + + +def check_dumpdir(path=IMG_DIR): + if os.path.isdir(path): + shutil.rmtree(path) + os.mkdir(path, 0o755) + + +def run_task_with_own_pty(task): + fd_m, fd_s = pty.openpty() + + pid = os.fork() + if pid == 0: + os.close(fd_m) + os.setsid() + os.dup2(fd_s, 0) + os.dup2(fd_s, 1) + os.dup2(fd_s, 2) + fcntl.ioctl(fd_s, termios.TIOCSCTTY, 1) + os.close(fd_s) + task() + exit(0) + + os.close(fd_s) + fd_m = os.fdopen(fd_m, "rb") + os.set_blocking(fd_m.fileno(), False) + + while True: + try: + data = fd_m.read() + except IOError: + break + if data is not None: + print(data.decode("utf-8")) + + _, status = os.waitpid(pid, 0) + + try: + data = fd_m.read() + except IOError as err: + print(err) + + if data is not None: + print(data.decode("utf-8")) + fd_m.close() + + if status != 0: + print("task %s exited badly: %d" % (task.__name__, status)) + exit(1) + + return 0 + + +def create_pty(): + fd_m, fd_s = pty.openpty() + return (os.fdopen(fd_m, "wb"), os.fdopen(fd_s, "wb")) + + +def create_isolated_dumpee(): + pathlib.Path("running").touch() + fd_m, fd_s = create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + os.dup2(fd_s.fileno(), 0) + os.dup2(fd_s.fileno(), 1) + os.dup2(fd_s.fileno(), 2) + fcntl.ioctl(fd_s.fileno(), termios.TIOCSCTTY, 1) + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + fd_m.close() + fd_s.close() + return pid + + +def criu_ns_dump(pid, shell_job=False): + cmd = [CRIU_NS, "dump", "-D", 
IMG_DIR, "-v4", "-t", str(pid), + "--log-file", DUMP_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + ret = subprocess.Popen(cmd).wait() + return ret + + +def criu_ns_restore(shell_job=False, restore_detached=False): + cmd = [CRIU_NS, "restore", "-D", IMG_DIR, "-v4", "--log-file", + RESTORE_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + if restore_detached: + cmd += ["--restore-detached", "--pidfile", PIDFILE] + ret = subprocess.Popen(cmd).wait() + return ret + + +def read_log_file(filename): + logfile_path = os.path.join(IMG_DIR, filename) + with open(logfile_path) as logfile: + print(logfile.read()) + + +def test_dump_and_restore_with_shell_job(): + print("Test criu-ns dump and restore with --shell-job option") + check_dumpdir() + pathlib.Path("running").touch() + pid = os.fork() + if pid == 0: + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + + ret = criu_ns_dump(pid, shell_job=True) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + os.unlink("running") + fd_m, fd_s = create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + fd_m.close() + # since criu-ns takes control of the tty stdin + os.dup2(fd_s.fileno(), 0) + ret = criu_ns_restore(shell_job=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + fd_s.close() + os.waitpid(pid, 0) + + +def test_dump_and_restore_without_shell_job(restore_detached=False): + print("Test criu-ns dump and restore with an isolated process" + "(%d)" % restore_detached) + check_dumpdir() + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + if not restore_detached: + os.unlink("running") + + pid = os.fork() + if pid == 0: + os.setsid() + ret = criu_ns_restore(restore_detached=restore_detached) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + os.waitpid(pid, 0) + + +def test_dump_and_restore_in_pidns(): 
+ if os.system("grep NSpid /proc/self/status"): + return + + print("Test criu-ns dump and restore in namespaces") + + def _dump(): + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _restore(): + ret = criu_ns_restore(restore_detached=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + def _get_restored_pid(): + restored_pid = 0 + pidfile_path = os.path.join(IMG_DIR, PIDFILE) + if not os.path.exists(pidfile_path): + raise FileNotFoundError("pidfile not found") + with open(pidfile_path, "r") as pidfile: + restored_pid = pidfile.read().strip() + return int(restored_pid) + + def _redump(): + global IMG_DIR + try: + restored_pid = _get_restored_pid() + except FileNotFoundError: + sys.exit(1) + IMG_DIR = "dumpdir2" + check_dumpdir(IMG_DIR) + ret = criu_ns_dump(restored_pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _re_restore(): + os.unlink("running") + ret = criu_ns_restore() + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + check_dumpdir() + _dump() + _restore() + _redump() + _re_restore() + + +def main(): + test_dump_and_restore_with_shell_job() + test_dump_and_restore_without_shell_job() + test_dump_and_restore_without_shell_job(restore_detached=True) + test_dump_and_restore_in_pidns() + + +if __name__ == "__main__": + run_task_with_own_pty(main) diff --git a/test/others/env.sh b/test/others/env.sh index 45066f760b..6fa2c9691b 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -1,17 +1,13 @@ #!/bin/sh -CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) +BASE_DIR="$(readlink -f "$(dirname "${BASH_SOURCE[0]}")/../../")" + +CRIU="${BASE_DIR}/criu/criu" criu=$CRIU -if [ $(which python3) ]; then - PYTHON=python3 -elif [ $(which python2) ]; then - PYTHON=python2 -else - echo "FAIL: Neither python3 nor python2" - exit 1 -fi -#export PYTHON -CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit-"${PYTHON}") + +export 
PYTHONPATH="${BASE_DIR}/lib:${BASE_DIR}/crit:${PYTHONPATH-}" +CRIT="python3 -m crit" crit=$CRIT -CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump-"${PYTHON}") + +CRIU_COREDUMP="${BASE_DIR}/coredump/coredump" criu_coredump=$CRIU_COREDUMP diff --git a/test/others/ext-tty/run.py b/test/others/ext-tty/run.py index 8109033cb9..2c268a2c8f 100755 --- a/test/others/ext-tty/run.py +++ b/test/others/ext-tty/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import subprocess import os, sys, time, signal, pty diff --git a/test/others/mnt-ext-dev/run.sh b/test/others/mnt-ext-dev/run.sh index 5a1f44450a..5cdbc45a82 100755 --- a/test/others/mnt-ext-dev/run.sh +++ b/test/others/mnt-ext-dev/run.sh @@ -2,16 +2,14 @@ set -e -x # construct root -python ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns +python3 ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns truncate -s 0 zdtm.loop truncate -s 50M zdtm.loop mkfs.ext4 -F zdtm.loop dev=`losetup --find --show zdtm.loop` -mkdir -p ../../dev -cp -ap $dev ../../dev export ZDTM_MNT_EXT_DEV=$dev -python ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? +python3 ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? losetup -d $dev unlink zdtm.loop exit $ret diff --git a/test/others/mounts/mounts.sh b/test/others/mounts/mounts.sh index 51ea69540d..bed156a50c 100755 --- a/test/others/mounts/mounts.sh +++ b/test/others/mounts/mounts.sh @@ -20,7 +20,7 @@ for i in `awk '{ print $2 }' < /proc/self/mounts`; do umount -l $i done -python mounts.py +python3 mounts.py kill $INMNTNS_PID while :; do sleep 10 diff --git a/test/others/ns_ext/run.sh b/test/others/ns_ext/run.sh index e416f95e53..5d1e139d73 100755 --- a/test/others/ns_ext/run.sh +++ b/test/others/ns_ext/run.sh @@ -2,10 +2,13 @@ set -x +if ! ../../zdtm/static/macvlan.checkskip; then + echo "No macvlan support. 
Skipping" + exit 0 +fi + if [[ "$1" == "pid" ]]; then NS=pid - # CentOS 7 kernels do not have NSpid -> skip this test - grep NSpid /proc/self/status || exit 0 else NS=net fi diff --git a/test/others/ns_ext/run_pidns.sh b/test/others/ns_ext/run_pidns.sh index 08c5bff8e8..db12106e03 100755 --- a/test/others/ns_ext/run_pidns.sh +++ b/test/others/ns_ext/run_pidns.sh @@ -2,9 +2,6 @@ set -e -# CentOS 7 kernels do not have NSpid -> skip this test -grep NSpid /proc/self/status || exit 0 - # This test creates a process in non-host pidns and then dumps it and restores # it into host pidns. We use pid >100000 in non-host pidns to make sure it does # not intersect with some host pid on restore but it is potentially racy so diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index fc64f0c977..69537bb0d3 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -4,7 +4,7 @@ all: test-c rpc_pb2.py criu CFLAGS += -g -Werror -Wall -I. LDLIBS += -lprotobuf-c -PYTHON ?= python +PYTHON ?= python3 run: all @make -C .. 
loop diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 90c80fcaea..6cffe270d0 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import argparse import os diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index f84757efd6..b600b6d1c4 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Test criu errno import socket, os, errno diff --git a/test/others/rpc/ps_test.py b/test/others/rpc/ps_test.py index b51357d426..daeda49bce 100755 --- a/test/others/rpc/ps_test.py +++ b/test/others/rpc/ps_test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys, errno import rpc_pb2 as rpc diff --git a/test/others/rpc/restore-loop.py b/test/others/rpc/restore-loop.py index 84a2ce56d1..67110c2cf5 100755 --- a/test/others/rpc/restore-loop.py +++ b/test/others/rpc/restore-loop.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index 9be5775872..afd4fb5e33 100755 --- a/test/others/rpc/run.sh +++ b/test/others/rpc/run.sh @@ -1,14 +1,6 @@ #!/bin/bash -set -ex - -if [ -e /etc/os-release ]; then - . 
/etc/os-release - if [ "$ID" == "centos" ] && [[ "$VERSION_ID" == "7"* ]];then - echo "Skipping tests on CentOS 7 because they do not work in CI" - exit 0 - fi -fi +set -e CRIU=./criu diff --git a/test/others/rpc/setup_swrk.py b/test/others/rpc/setup_swrk.py index c7f84f952a..ffaa01de42 100644 --- a/test/others/rpc/setup_swrk.py +++ b/test/others/rpc/setup_swrk.py @@ -5,12 +5,6 @@ def setup_swrk(): print('Connecting to CRIU in swrk mode.') s1, s2 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) - - kwargs = {} - if sys.version_info.major == 3: - kwargs["pass_fds"] = [s1.fileno()] - - swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], **kwargs) + swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], pass_fds=[s1.fileno()]) s1.close() return swrk, s2 - diff --git a/test/others/rpc/test.py b/test/others/rpc/test.py index 80f6338f45..ce8411bc60 100755 --- a/test/others/rpc/test.py +++ b/test/others/rpc/test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/version.py b/test/others/rpc/version.py index 9d7fa745b5..a18cd5b7b7 100755 --- a/test/others/rpc/version.py +++ b/test/others/rpc/version.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import rpc_pb2 as rpc diff --git a/test/others/shell-job/run.py b/test/others/shell-job/run.py index a59945d6a7..969965f00f 100755 --- a/test/others/shell-job/run.py +++ b/test/others/shell-job/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os, pty, sys, subprocess import termios, fcntl, time diff --git a/test/others/skip-file-rwx-check/run.sh b/test/others/skip-file-rwx-check/run.sh index 0803d78eca..0776ebf618 100755 --- a/test/others/skip-file-rwx-check/run.sh +++ b/test/others/skip-file-rwx-check/run.sh @@ -10,11 +10,11 @@ source ../env.sh make clean touch testfile chmod +w testfile -tail --follow testfile & -tailpid=$! -if ! 
"$criu" dump --tree=$tailpid --shell-job --verbosity=4 --log-file=dump.log +bash -c 'exec 3= 5: - import importlib.util - spec = importlib.util.spec_from_file_location(name, path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - else: - import imp - mod = imp.load_source(name, path) + import importlib.util + spec = importlib.util.spec_from_file_location(name, path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) return mod @@ -828,7 +884,7 @@ def __init__(self, name, desc, flavor, freezer, rootless): self._bins += self.__subs self._deps += get_test_desc('zdtm/lib/groups')['deps'] - self._env = {'ZDTM_TESTS': self.__real_name} + self._env['ZDTM_TESTS'] = self.__real_name def __get_start_cmd(self, name): tdir = os.path.dirname(name) @@ -838,7 +894,7 @@ def __get_start_cmd(self, name): subprocess.check_call(s_args + [tname + '.cleanout']) s = subprocess.Popen(s_args + ['--dry-run', tname + '.pid'], stdout=subprocess.PIPE) - out, _ = s.communicate() + out, _ = s.communicate(timeout=self.__timeout) cmd = out.decode().splitlines()[-1].strip() return 'cd /' + tdir + ' && ' + cmd @@ -882,15 +938,22 @@ def run(action, fault=None, strace=[], preexec=None, - nowait=False): + preload_libfault=False, + nowait=False, + timeout=60): env = dict( os.environ, - ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0") + ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0", + CRIU_LIBS_DIR=PLUGINS_DIR + ) if fault: print("Forcing %s fault" % fault) env['CRIU_FAULT'] = fault + if preload_libfault: + env['LD_PRELOAD'] = LIBFAULT_PATH + cr = subprocess.Popen(strace + [criu_bin, action, "--no-default-config"] + args, env=env, @@ -898,7 +961,11 @@ def run(action, preexec_fn=preexec) if nowait: return cr - return cr.wait() + return cr.wait(timeout=timeout) + + @staticmethod + def exit_signal(ret): + return ret < 0 class criu_rpc_process: @@ -981,7 +1048,9 @@ def run(action, fault=None, strace=[], 
preexec=None, - nowait=False): + preload_libfault=False, + nowait=False, + timeout=None): if fault: raise test_fail_exc('RPC and FAULT not supported') if strace: @@ -1018,8 +1087,11 @@ def run(action, else: raise test_fail_exc('RPC for %s required' % action) except crpc.CRIUExceptionExternal as e: - print("Fail", e) - ret = -1 + if e.typ != e.resp_typ: + ret = -2 + else: + print("Fail", e) + ret = -1 else: ret = 0 @@ -1032,6 +1104,10 @@ def run(action, return ret + @staticmethod + def exit_signal(ret): + return ret == -2 + class criu: def __init__(self, opts): @@ -1065,7 +1141,9 @@ def __init__(self, opts): self.__criu_bin = opts['criu_bin'] self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = opts['pre_dump_mode'] + self.__preload_libfault = bool(opts['preload_libfault']) self.__mntns_compat_mode = bool(opts['mntns_compat_mode']) + self.__cuda_checkpoint = bool(opts['mocked_cuda_checkpoint']) if opts['rpc']: self.__criu = criu_rpc @@ -1148,6 +1226,9 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): s_args = ["--log-file", log, "--images-dir", self.__ddir(), "--verbosity=4"] + opts + if self.__cuda_checkpoint: + s_args += [ "--libdir" , os.path.join(os.getcwd(), "..", "plugins", "cuda") ] + with open(os.path.join(self.__ddir(), action + '.cropt'), 'w') as f: f.write(' '.join(s_args) + '\n') @@ -1192,8 +1273,10 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): with open("/proc/sys/kernel/ns_last_pid") as ns_last_pid_fd: ns_last_pid = ns_last_pid_fd.read() + preload_libfault = self.__preload_libfault and action in ['dump', 'pre-dump', 'restore'] + ret = self.__criu.run(action, s_args, self.__criu_bin, self.__fault, - strace, preexec, nowait) + strace, preexec, preload_libfault, nowait) if nowait: os.close(status_fds[1]) @@ -1233,8 +1316,8 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): return rst_succeeded = os.access( os.path.join(__ddir, "restore-succeeded"), os.F_OK) - if self.__test.blocking() or (self.__sat 
and action == 'restore' and - rst_succeeded): + if (self.__test.blocking() and not self.__criu.exit_signal(ret)) or \ + (self.__sat and action == 'restore' and rst_succeeded): raise test_fail_expected_exc(action) else: raise test_fail_exc("CRIU %s" % action) @@ -2010,12 +2093,20 @@ def __init__(self, opts, nr_tests): file=self.__file_report) print(u"# ", file=self.__file_report) print(u"1.." + str(nr_tests), file=self.__file_report) - with open("/proc/sys/kernel/tainted") as taintfd: - self.__taint = taintfd.read() + self.__taint = self.__read_kernel_tainted() if int(self.__taint, 0) != 0: - print("The kernel is tainted: %r" % self.__taint) - if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != '1': - raise Exception("The kernel is tainted: %r" % self.__taint) + self.__report_kernel_taint("The kernel is tainted: %r" % self.__taint) + + @staticmethod + def __read_kernel_tainted(): + with open("/proc/sys/kernel/tainted") as taintfd: + return taintfd.read().strip() + + @staticmethod + def __report_kernel_taint(msg): + print(msg) + if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != "1": + raise Exception(msg) def __show_progress(self, msg): perc = int(self.__nr * 16 / self.__total) @@ -2041,11 +2132,12 @@ def run_test(self, name, desc, flavor): if len(self.__subs) >= self.__max: self.wait() - with open("/proc/sys/kernel/tainted") as taintfd: - taint = taintfd.read() + taint = self.__read_kernel_tainted() if self.__taint != taint: - raise Exception("The kernel is tainted: %r (%r)" % - (taint, self.__taint)) + prev_taint = self.__taint + self.__taint = taint + self.__report_kernel_taint( + "The kernel is tainted: %r (was %r)" % (taint, prev_taint)) ''' The option --link-remap allows criu to hardlink open files back to the @@ -2074,7 +2166,7 @@ def run_test(self, name, desc, flavor): 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 'crit_bin', 
'pre_dump_mode', 'mntns_compat_mode', - 'rootless') + 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2087,8 +2179,11 @@ def run_test(self, name, desc, flavor): if opts['rootless'] and os.getuid() == 0: os.setgid(NON_ROOT_UID) os.setuid(NON_ROOT_UID) + env = dict(os.environ, CR_CT_TEST_INFO=arg) + if opts['mocked_cuda_checkpoint']: + env['PATH'] = os.path.join(os.getcwd(), "cuda-checkpoint") + ":" + env["PATH"] sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], - env=dict(os.environ, CR_CT_TEST_INFO=arg), + env=env, stdout=log, stderr=subprocess.STDOUT, close_fds=True) @@ -2228,9 +2323,21 @@ def all_tests(opts): continue files.append(fp) excl = list(map(lambda x: os.path.join(desc['dir'], x), desc['exclude'])) - tlist = list(filter( + tlist = list(sorted(filter( lambda x: not x.endswith('.checkskip') and not x.endswith('.hook') and - x not in excl, map(lambda x: x.strip(), files))) + x not in excl, map(lambda x: x.strip(), files)))) + + if opts.get('test_shard_count'): + if opts.get('test_shard_index') is None: + raise KeyError('--test_shard_count > 0 must come with --test_shard_index') + slice_idx = opts['test_shard_index'] + slices = opts['test_shard_count'] + if slice_idx >= slices: + raise IndexError('--test_shard_index not less than --test_shard_count ({} >= {})'.format(slice_idx, slices)) + slist = list(tlist[slice_idx::slices]) + print("We're shard #{} of {}. 
Running {} of {} tests.\n".format(slice_idx, slices, len(slist), len(tlist))) + tlist = slist + return tlist @@ -2341,11 +2448,6 @@ def run_tests(opts): return torun = list(torun) - if opts['keep_going'] and len(torun) < 2: - print( - "[WARNING] Option --keep-going is more useful when running multiple tests" - ) - opts['keep_going'] = False if opts['exclude']: excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") @@ -2388,6 +2490,7 @@ def run_tests(opts): "Specify --criu-image-streamer-dir or modify PATH to provide an alternate location") .format(streamer_dir)) + usernsIsSupported = criu.check("userns") launcher = Launcher(opts, len(torun)) try: for t in torun: @@ -2457,7 +2560,7 @@ def run_tests(opts): run_flavs = set(test_flavs) & set(opts_flavs) else: run_flavs = set([test_flavs.pop()]) - if not criu.check("userns"): + if not usernsIsSupported: run_flavs -= set(['uns']) if opts['user']: # FIXME -- probably uns will make sense @@ -2767,6 +2870,19 @@ def get_cli_args(): rp.add_argument("--mntns-compat-mode", help="Use old compat mounts restore engine", action='store_true') + rp.add_argument("--test-shard-index", type=int, default=None, + help="Select tests for a shard (0-based)") + rp.add_argument("--test-shard-count", type=int, default=0, + help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") + rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") + rp.add_argument("--criu-plugin", + help="Run tests with CRIU plugin", + choices=['amdgpu', 'cuda', 'inventory_test_enabled', 'inventory_test_disabled'], + nargs='+', + default=None) + rp.add_argument("--mocked-cuda-checkpoint", + action="store_true", + help="Run criu with the cuda plugin and the mocked cuda-checkpoint tool") lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index d345233154..24f32c6068 
100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -23,12 +23,12 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 else ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) # To build aarch32 on armv8 Travis-CI (see criu Makefile) - USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif endif @@ -40,8 +40,8 @@ endif PKG_CONFIG ?= pkg-config CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wdeclaration-after-statement -Wstrict-prototypes -CFLAGS += $(USERCFLAGS) -CFLAGS += -D_GNU_SOURCE +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) +CFLAGS += -D_GNU_SOURCE -D_LARGEFILE64_SOURCE CPPFLAGS += -iquote $(LIBDIR)/arch/$(ARCH)/include ifeq ($(strip $(V)),) diff --git a/test/zdtm/criu_config.py b/test/zdtm/criu_config.py index 487becfb4b..9fd2927476 100644 --- a/test/zdtm/criu_config.py +++ b/test/zdtm/criu_config.py @@ -11,6 +11,7 @@ def run(action, fault=None, strace=[], preexec=None, + preload=False, nowait=False): config_path = tempfile.mktemp(".conf", "criu-%s-" % action) @@ -40,3 +41,7 @@ def run(action, if nowait: return cr return cr.wait() + + @staticmethod + def exit_signal(ret): + return ret < 0 diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 90bd28f9e1..428d726d66 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -1,6 +1,6 @@ LIBDIR := . 
-CFLAGS += $(USERCFLAGS) +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) LIB := libzdtmtst.a @@ -34,4 +34,4 @@ clean: clean-more $(LIB): $(LIBOBJ) $(E) " AR " $@ - $(Q)ar rcs $@ $^ + $(Q)$(AR) rcs $@ $^ diff --git a/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h new file mode 100644 index 0000000000..1803aaeb44 --- /dev/null +++ b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h @@ -0,0 +1,49 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + +#define atomic_get(v) (*(volatile int *)v) +#define atomic_set(v, i) (*(v) = (i)) + +static inline int __atomic_add(int i, atomic_t *v) +{ + int result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(*v), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)i, v) +#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) +#define atomic_inc(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub_return(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(*ptr) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index bf8cd9cd31..efcc7a1d08 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -54,7 +54,7 @@ mnt_info_t *get_cwd_mnt_info(void) while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); - ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); + ret = sscanf(str, "%i %i %u:%u 
%4095s %4095s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); diff --git a/test/zdtm/lib/lock.h b/test/zdtm/lib/lock.h index 2b23550be5..cc5306e060 100644 --- a/test/zdtm/lib/lock.h +++ b/test/zdtm/lib/lock.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "asm/atomic.h" #define BUG_ON(condition) \ diff --git a/test/zdtm/lib/msg.c b/test/zdtm/lib/msg.c index 1cf92e3e01..9ba1c47a43 100644 --- a/test/zdtm/lib/msg.c +++ b/test/zdtm/lib/msg.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -55,7 +56,7 @@ void test_msg(const char *format, ...) off += strftime(buf, sizeof(buf), "%H:%M:%S", tm); } - off += sprintf(buf + off, ".%.3ld: ", tv.tv_usec / 1000); + off += sprintf(buf + off, ".%.3" PRId64 ": ", (int64_t)(tv.tv_usec / 1000)); off += sprintf(buf + off, "%5d: ", getpid()); skip: diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 6f6cccc992..3c0dbdeb80 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,7 @@ extern int pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path; + char *root, *criu_path, *dev_path; char path[PATH_MAX]; root = getenv("ZDTM_ROOT"); @@ -51,6 +52,19 @@ static int prepare_mntns(void) return -1; } + dev_path = getenv("ZDTM_DEV"); + if (dev_path) { + snprintf(path, sizeof(path), "%s/dev", root); + if (mount(dev_path, path, NULL, MS_BIND, NULL)) { + pr_perror("Unable to mount %s", path); + return -1; + } + if (mount(NULL, path, NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to mount %s", path); + return -1; + } + } + criu_path = getenv("ZDTM_CRIU"); if (criu_path) { snprintf(path, sizeof(path), "%s%s", root, criu_path); @@ -218,7 +232,7 @@ static inline int _settime(clockid_t clk_id, time_t offset) if (clk_id == CLOCK_MONOTONIC_COARSE || clk_id == 
CLOCK_MONOTONIC_RAW) clk_id = CLOCK_MONOTONIC; - len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, offset); + len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, (int64_t)offset); fd = open("/proc/self/timens_offsets", O_WRONLY); if (fd < 0) { diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index 6291ea4a7b..a5ba38b2dd 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -406,7 +406,7 @@ pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid { #ifdef __x86_64__ return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); -#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__) +#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64) return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); #elif __s390x__ return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 4b3d2e3418..44ac64fe57 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -53,6 +53,12 @@ TST_NOFILE := \ shm \ shm-mp \ ptrace_sig \ + pidfd_self \ + pidfd_of_thread \ + pidfd_dead \ + pidfd_child \ + pidfd_kill \ + fd_from_pidfd \ pipe00 \ pipe01 \ pipe02 \ @@ -62,6 +68,7 @@ TST_NOFILE := \ pthread_timers \ pthread_timers_h \ rseq00 \ + membarrier \ vdso00 \ vdso01 \ vdso02 \ @@ -84,7 +91,8 @@ TST_NOFILE := \ socket-tcp4v6 \ socket-tcp-local \ socket-tcp-reuseport \ - socket-tcp-nfconntrack \ + socket-tcp-ipt-nfconntrack \ + socket-tcp-nft-nfconntrack \ socket-tcp6-local \ socket-tcp4v6-local \ socket-tcpbuf \ @@ -125,6 +133,8 @@ TST_NOFILE := \ sock_opts02 \ sock_ip_opts00 \ sock_ip_opts01 \ + sock_tcp_opts00 \ + sock_tcp_opts01 \ sk-unix-unconn \ sk-unix-unconn-seqpacket \ ipc_namespace \ @@ -215,6 +225,7 @@ TST_NOFILE := \ seccomp_filter_tsync \ seccomp_filter_threads \ seccomp_filter_inheritance \ + seccomp_no_new_privs \ 
different_creds \ vsx \ bridge \ @@ -256,6 +267,8 @@ TST_NOFILE := \ memfd02 \ memfd02-hugetlb \ memfd03 \ + memfd04 \ + memfd05 \ shmemfd \ shmemfd-priv \ time \ @@ -265,6 +278,7 @@ TST_NOFILE := \ sigtrap \ sigtrap01 \ change_mnt_context \ + fd_offset \ # jobctl00 \ PKG_CONFIG ?= pkg-config @@ -272,7 +286,7 @@ pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ - bpf_array + bpf_array endif ifneq ($(ARCH),arm) @@ -396,6 +410,7 @@ TST_DIR = \ cgroup_ignore \ cgroup_stray \ cgroup_yard \ + cgroup_threads \ unlink_fstat04 \ unlink_fstat041 \ mntns_remap \ @@ -593,7 +608,8 @@ socket-tcpbuf6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp4v6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV4V6 socket-tcp-local: CFLAGS += -D ZDTM_TCP_LOCAL -socket-tcp-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_CONNTRACK +socket-tcp-ipt-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_IPT_CONNTRACK +socket-tcp-nft-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_NFT_CONNTRACK socket_listen6: CFLAGS += -D ZDTM_IPV6 socket_listen4v6: CFLAGS += -D ZDTM_IPV4V6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 @@ -601,6 +617,7 @@ socket-tcp6-closed: CFLAGS += -D ZDTM_IPV4V6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp-skip-in-flight: CFLAGS += -D ZDTM_IPV4V6 sock_ip_opts01: CFLAGS += -DZDTM_VAL_ZERO +sock_tcp_opts01: CFLAGS += -DZDTM_VAL_ZERO tun_ns: CFLAGS += -DTUN_NS mnt_ext_manual: CFLAGS += -D ZDTM_EXTMAP_MANUAL mntns_pivot_root_ro: CFLAGS += -DMNTNS_PIVOT_ROOT_RO @@ -653,6 +670,7 @@ socket-tcp6-unconn: CFLAGS += -D ZDTM_IPV6 socket-tcp4v6-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK -D ZDTM_IPV4V6 socket-tcp4v6-closing: CFLAGS += -D ZDTM_IPV4V6 memfd02-hugetlb: CFLAGS += -D ZDTM_HUGETLB +memfd05: CFLAGS += -D ZDTM_MEMFD05 sockets00-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET sockets01-seqpacket: CFLAGS += -D 
ZDTM_UNIX_SEQPACKET @@ -676,6 +694,8 @@ s390x_gs_threads: LDFLAGS += -pthread thread_different_uid_gid: LDLIBS += -pthread -lcap +cgroup_threads: LDFLAGS += -pthread + bpf_hash: LDLIBS += -lbpf bpf_array: LDLIBS += -lbpf diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index 713ffaa469..dc16368217 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -59,7 +59,7 @@ int checkprofile(void) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/apparmor_stacking.c b/test/zdtm/static/apparmor_stacking.c index 76de8b8b49..0bc36048cf 100644 --- a/test/zdtm/static/apparmor_stacking.c +++ b/test/zdtm/static/apparmor_stacking.c @@ -56,7 +56,7 @@ static int checkprofile(pid_t pid, char *expected) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/cgroup00.desc b/test/zdtm/static/cgroup00.desc index 3c6c4a7e22..42a3f2b73a 100644 --- a/test/zdtm/static/cgroup00.desc +++ b/test/zdtm/static/cgroup00.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup01.c b/test/zdtm/static/cgroup01.c index bc8515264d..7bfb677623 100644 --- a/test/zdtm/static/cgroup01.c +++ b/test/zdtm/static/cgroup01.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) if (!s) continue; - sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(paux, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { diff --git a/test/zdtm/static/cgroup01.desc b/test/zdtm/static/cgroup01.desc index 3c6c4a7e22..42a3f2b73a 100644 --- a/test/zdtm/static/cgroup01.desc +++ b/test/zdtm/static/cgroup01.desc @@ -1 +1 @@ 
-{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup02.c b/test/zdtm/static/cgroup02.c index 6229a8a089..8a925c0a43 100644 --- a/test/zdtm/static/cgroup02.c +++ b/test/zdtm/static/cgroup02.c @@ -75,7 +75,7 @@ bool test_exists(char *mountinfo_line, char *path) char aux[1024], paux[1024]; struct stat st; - sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); ssprintf(paux, "%s/%s", aux, path); diff --git a/test/zdtm/static/cgroup02.desc b/test/zdtm/static/cgroup02.desc index df17a57891..eb5a9dd372 100644 --- a/test/zdtm/static/cgroup02.desc +++ b/test/zdtm/static/cgroup02.desc @@ -1,4 +1,4 @@ { 'dopts': '--manage-cgroups --cgroup-root name=zdtmtst:/prefix', - 'flags': 'suid', + 'flags': 'suid excl', 'flavor': 'h', 'ropts': '--manage-cgroups --cgroup-root /newroot --cgroup-root name=zdtmtst:/prefix'} diff --git a/test/zdtm/static/cgroup04.c b/test/zdtm/static/cgroup04.c index 8c40ffd6bd..f586a0628d 100644 --- a/test/zdtm/static/cgroup04.c +++ b/test/zdtm/static/cgroup04.c @@ -17,25 +17,25 @@ const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); -static const char *cgname = "zdtmtst"; +static const char *const cgname = "zdtmtst"; int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) { char aux[1024], paux[1024], subdir[1024]; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", dirname); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", subdir); return -1; } if (mount("none", subdir, "cgroup", 0, controller)) { - pr_perror("Can't mount cgroups"); + pr_perror("Can't mount cgroup controller 
%s at %s", controller, subdir); goto err_rd; } @@ -52,7 +52,8 @@ int mount_and_add(const char *controller, const char *path, const char *prop, co goto err_rs; ssprintf(paux, "%s/%s/special_prop_check", subdir, path); - mkdir(paux, 0600); + if (mkdir(paux, 0600) < 0) + pr_perror("Can't make dir %s", paux); return 0; err_rs: @@ -74,11 +75,11 @@ bool checkval(char *path, char *val) } n = read(fd, buf, sizeof(buf) - 1); + if (n < 0) + pr_perror("read %s", path); close(fd); - if (n < 0) { - pr_perror("read"); + if (n < 0) return false; - } buf[n] = 0; if (strcmp(val, buf)) { @@ -95,7 +96,7 @@ int main(int argc, char **argv) char buf[1024], path[PATH_MAX]; struct stat sb; - char *dev_allow[] = { + const char *const dev_allow[] = { "c *:* m", "b *:* m", "c 1:3 rwm", "c 1:5 rwm", "c 1:7 rwm", "c 5:0 rwm", "c 5:2 rwm", "c 1:8 rwm", "c 1:9 rwm", "c 136:* rwm", "c 10:229 rwm", }; @@ -126,12 +127,14 @@ int main(int argc, char **argv) sprintf(path, "%s/devices/%s/devices.list", dirname, cgname); if (!checkval(path, buf)) { + errno = 0; fail(); goto out; } sprintf(path, "%s/memory/%s/memory.limit_in_bytes", dirname, cgname); if (!checkval(path, "268435456\n")) { + errno = 0; fail(); goto out; } @@ -143,6 +146,7 @@ int main(int argc, char **argv) } if (!S_ISDIR(sb.st_mode)) { + errno = 0; fail("special_prop_check not a directory?"); goto out; } diff --git a/test/zdtm/static/cgroup04.checkskip b/test/zdtm/static/cgroup04.checkskip index 205f8fc530..1ccbada4d0 100755 --- a/test/zdtm/static/cgroup04.checkskip +++ b/test/zdtm/static/cgroup04.checkskip @@ -1,3 +1,20 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! -f /sys/fs/cgroup/cgroup.controllers + +for ctl in devices memory; do + # Check that the controller is available. + + grep -q "^${ctl}\\s" /proc/cgroups + + # Check that the controller is not co-mounted with any other. + + # /proc/self/cgroup may have: + # "1:devices:/sys" + if ! 
grep -q "^[0-9]*:${ctl}:" /proc/self/cgroup; then + # but not eg: + # "1:devices,job:/sys" + grep -qE "^[0-9]*:([^:]*,)?${ctl}(,[^:]*)?:" /proc/self/cgroup && exit 1 + fi +done diff --git a/test/zdtm/static/cgroup_ifpriomap.checkskip b/test/zdtm/static/cgroup_ifpriomap.checkskip index 205f8fc530..f401ad1b24 100755 --- a/test/zdtm/static/cgroup_ifpriomap.checkskip +++ b/test/zdtm/static/cgroup_ifpriomap.checkskip @@ -1,3 +1,6 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! -f /sys/fs/cgroup/cgroup.controllers + +grep -q '^net_prio\s' /proc/cgroups diff --git a/test/zdtm/static/cgroup_threads.c b/test/zdtm/static/cgroup_threads.c new file mode 100644 index 0000000000..2c17e13a77 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that cgroup layout of threads is preserved"; +const char *test_author = "Michał Cłapiński "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = "zdtmtst"; +#define SUBNAME "subcg_threads" +#define SUBNAME2 SUBNAME "/subsubcg" + +#define exit_group(code) syscall(__NR_exit_group, code) + +static int cg_move(char *name) +{ + int cgfd, l; + char paux[256]; + + sprintf(paux, "%s/%s", dirname, name); + if (mkdir(paux, 0600)) { + pr_perror("Can't create %s", paux); + return -1; + } + + sprintf(paux, "%s/%s/tasks", dirname, name); + + cgfd = open(paux, O_WRONLY); + if (cgfd < 0) { + pr_perror("Can't open tasks"); + return -1; + } + + l = write(cgfd, "0", 2); + close(cgfd); + + if (l < 0) { + pr_perror("Can't move self to subcg"); + return -1; + } + + return 0; +} + +static int cg_check(char *name) +{ + int found = 0; + FILE *cgf; + char paux[256], aux[128]; + + cgf = fopen("/proc/thread-self/cgroup", "r"); + if (cgf == NULL) + return -1; + + sprintf(aux, "name=%s:/%s", cgname, name); + while (fgets(paux, 
sizeof(paux), cgf)) { + char *s; + + s = strchr(paux, ':') + 1; + s[strlen(s) - 1] = '\0'; + test_msg("CMP [%s] vs [%s]\n", s, aux); + if (!strcmp(s, aux)) { + found = 1; + break; + } + } + + fclose(cgf); + + return found ? 0 : -1; +} + +int th_sync[2], rst_sync[2]; + +void *thread_fn(void *args) +{ + int status = cg_move(SUBNAME2); + + if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { + pr_perror("write"); + exit_group(1); + } + + if (status == 0) { + if (read(rst_sync[0], &status, sizeof(status)) < 0) { + pr_perror("read"); + exit_group(1); + } + + status = cg_check(SUBNAME2); + if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { + pr_perror("write"); + exit_group(1); + } + } + + pthread_exit(0); +} + +int main(int argc, char **argv) +{ + int status, exit_code = 1; + pthread_t thread; + char aux[64]; + + test_init(argc, argv); + + /* + * Pipe to talk to the kid. + * First, it reports that it's ready (int), + * then it reports the restore status (int). 
+ */ + + if (pipe(th_sync)) { + pr_perror("pipe"); + return 1; + } + + /* "Restore happened" pipe */ + if (pipe(rst_sync)) { + pr_perror("pipe"); + return 1; + } + + if (mkdir(dirname, 0700) < 0) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(aux, "none,name=%s", cgname); + if (mount("none", dirname, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto out_rd; + } + + if (cg_move(SUBNAME)) + goto out_rs; + + if (pthread_create(&thread, NULL, thread_fn, NULL)) { + pr_perror("Can't create a new thread"); + goto out_rs; + } + + status = -1; + read(th_sync[0], &status, sizeof(status)); + if (status != 0) { + pr_perror("Error moving into cgroups"); + close(rst_sync[0]); + goto out_rs; + } + + test_daemon(); + test_waitsig(); + + close(rst_sync[1]); + + status = -1; + if (read(th_sync[0], &status, sizeof(status)) < 0) { + pr_perror("read"); + goto out_rs; + } + if (status != 0) { + fail("child cg changed"); + goto out_rs; + } + + pass(); + exit_code = 0; + +out_rs: + umount(dirname); +out_rd: + rmdir(dirname); +out: + return exit_code; +} diff --git a/test/zdtm/static/cgroup_threads.desc b/test/zdtm/static/cgroup_threads.desc new file mode 100644 index 0000000000..42a3f2b73a --- /dev/null +++ b/test/zdtm/static/cgroup_threads.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup_threads.hook b/test/zdtm/static/cgroup_threads.hook new file mode 100755 index 0000000000..f4b553d347 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.hook @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +tname=$(mktemp -d cgclean.XXXXXX) +trap 'rmdir "${tname}"' EXIT + +mount -t cgroup none $tname -o "none,name=zdtmtst" +trap 'umount "${tname}"; rmdir "${tname}"' EXIT + +echo "Cleaning $tname" + +rmdir "$tname/subcg_threads/subsubcg/" || true +rmdir "$tname/subcg_threads/" || true + +echo "Left there is:" +ls "$tname" diff --git 
a/test/zdtm/static/cgroup_yard.desc b/test/zdtm/static/cgroup_yard.desc index 8736d6780d..9ad4a9b578 100644 --- a/test/zdtm/static/cgroup_yard.desc +++ b/test/zdtm/static/cgroup_yard.desc @@ -1,6 +1,6 @@ { 'flavor': 'h', -'flags': 'suid', +'flags': 'suid excl', # We create the external cgroup yard in working directory during --pre-dump # hook. We have to go up a few directories to find the yard. 'opts': '--manage-cgroups --cgroup-yard ../../../../../../external_yard' diff --git a/test/zdtm/static/cgroup_yard.hook b/test/zdtm/static/cgroup_yard.hook index d06bc45fde..b70bd59e94 100755 --- a/test/zdtm/static/cgroup_yard.hook +++ b/test/zdtm/static/cgroup_yard.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/zdtm/static/cgroupns.desc b/test/zdtm/static/cgroupns.desc index 80dd710e17..dc61e36cff 100644 --- a/test/zdtm/static/cgroupns.desc +++ b/test/zdtm/static/cgroupns.desc @@ -1,4 +1,4 @@ { 'feature': 'cgroupns', - 'flags': 'suid', + 'flags': 'suid excl', 'flavor': 'h', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroupv2_00.desc b/test/zdtm/static/cgroupv2_00.desc index 4bfd4b2656..e70c84df81 100644 --- a/test/zdtm/static/cgroupv2_00.desc +++ b/test/zdtm/static/cgroupv2_00.desc @@ -1 +1 @@ -{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} +{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_01.desc b/test/zdtm/static/cgroupv2_01.desc index 4bfd4b2656..e70c84df81 100644 --- a/test/zdtm/static/cgroupv2_01.desc +++ b/test/zdtm/static/cgroupv2_01.desc @@ -1 +1 @@ -{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} +{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/change_mnt_context.c b/test/zdtm/static/change_mnt_context.c index 6d436014b3..8787ae5cf9 100644 --- a/test/zdtm/static/change_mnt_context.c +++ b/test/zdtm/static/change_mnt_context.c @@ 
-46,7 +46,7 @@ int main(int argc, char **argv) if (!pos) continue; - result = sscanf(pos, " - %*s %*s %s", opts); + result = sscanf(pos, " - %*s %*s %1023s", opts); if (result != 1) { fail("Not able to sscanf line from mountinfo"); goto out; diff --git a/test/zdtm/static/child_opened_proc.c b/test/zdtm/static/child_opened_proc.c index 2125cd264e..cfe04fa4be 100644 --- a/test/zdtm/static/child_opened_proc.c +++ b/test/zdtm/static/child_opened_proc.c @@ -10,7 +10,7 @@ #include "zdtmtst.h" const char *test_doc = "Check that tree prior to files opening"; -const char *test_author = "Stanislav Kinsbursky +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if fd obtained from pidfd_get_fd is C/R correctly\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) +{ + return syscall(__NR_pidfd_getfd, pidfd, targetfd, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int pidfd, child, p[2], child_read, read_data, status; + int data = 42; + + test_init(argc, argv); + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + close(p[WRITE]); + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + close(p[READ]); + if (write(p[WRITE], &data, sizeof(data)) != sizeof(data)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + child_read = pidfd_getfd(pidfd, p[READ], 0); + if (child_read < 0) { + pr_perror("pidfd_getfd"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if 
(read(child_read, &read_data, sizeof(read_data)) != sizeof(read_data)) { + pr_perror("read"); + goto err_close; + } + + if (read_data != data) { + fail("data from fd obtained using pidfd_getfd incorrect"); + goto err_close; + } + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + pr_perror("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + + pass(); + close(child_read); + close(pidfd); + return 0; +err_close: + close(child_read); + close(pidfd); + return 1; +} diff --git a/test/zdtm/static/fd_offset.c b/test/zdtm/static/fd_offset.c new file mode 100644 index 0000000000..96255a4a1f --- /dev/null +++ b/test/zdtm/static/fd_offset.c @@ -0,0 +1,42 @@ +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check that criu properly restores offsets on ELF files"; +const char *test_author = "Michal Clapinski "; + +void check_offset(int fd) +{ + int offset = lseek(fd, 0, SEEK_CUR); + if (offset < 0) { + fail("lseek"); + exit(1); + } + if (offset != 0) { + fail("wrong offset; expected: 0, got: %d", offset); + exit(1); + } +} + +int main(int argc, char **argv) +{ + int fd; + + test_init(argc, argv); + + fd = open("/proc/self/exe", O_RDONLY); + if (fd < 0) { + fail("open"); + exit(1); + } + check_offset(fd); + + test_daemon(); + test_waitsig(); + + check_offset(fd); + + pass(); + return 0; +} diff --git a/test/zdtm/static/file_locks01.c b/test/zdtm/static/file_locks01.c index beea171f5d..bfdca51d93 100644 --- a/test/zdtm/static/file_locks01.c +++ b/test/zdtm/static/file_locks01.c @@ -107,7 +107,7 @@ static int check_file_lock(int fd, char *expected_type, char *expected_option, u memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", fl_flag, 
fl_type, fl_option, &fl_owner, &maj, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no); if (num < 7) { pr_err("Invalid lock info\n"); diff --git a/test/zdtm/static/file_locks02.c b/test/zdtm/static/file_locks02.c index d2049ebaa2..ae4827de97 100644 --- a/test/zdtm/static/file_locks02.c +++ b/test/zdtm/static/file_locks02.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks03.c b/test/zdtm/static/file_locks03.c index 35ef41a21b..228e668925 100644 --- a/test/zdtm/static/file_locks03.c +++ b/test/zdtm/static/file_locks03.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks04.c b/test/zdtm/static/file_locks04.c index 11d224fa70..7e0d2654e1 100644 --- a/test/zdtm/static/file_locks04.c +++ b/test/zdtm/static/file_locks04.c @@ -34,7 +34,7 @@ static int check_file_locks(pid_t child_pid, int fd, int child_fd) continue; test_msg("c: %s", buf); - num = sscanf(buf, "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { diff --git a/test/zdtm/static/file_locks06.checkskip 
b/test/zdtm/static/file_locks06.checkskip index 06ab585216..c5039a2d23 100755 --- a/test/zdtm/static/file_locks06.checkskip +++ b/test/zdtm/static/file_locks06.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import fcntl import tempfile import struct diff --git a/test/zdtm/static/macvlan.checkskip b/test/zdtm/static/macvlan.checkskip new file mode 100755 index 0000000000..f4e0609536 --- /dev/null +++ b/test/zdtm/static/macvlan.checkskip @@ -0,0 +1,38 @@ +#!/bin/bash + +FAIL=0 + +create_macvlan_device() { + if ! ip link add test_mvlan1 type veth >/dev/null 2>&1; then + FAIL=1 + fi + if ! ip link add mymacvlan1 link test_mvlan1 type macvlan >/dev/null 2>&1; then + FAIL=1 + fi + + return "${FAIL}" +} + +cleanup() { + ip link del test_mvlan1 >/dev/null 2>&1 + ip link del mymacvlan1 >/dev/null 2>&1 +} + +trap "cleanup" QUIT TERM INT HUP EXIT + +# Test once without loading the module +if create_macvlan_device; then + exit 0 +fi + +# Test once more with explicitly loading the module +if ! modprobe macvlan >/dev/null 2>&1; then + exit 1 +fi +create_macvlan_device + +if [ "${FAIL}" == "1" ]; then + exit 1 +fi + +exit 0 diff --git a/test/zdtm/static/maps00.c b/test/zdtm/static/maps00.c index b1e55e8614..f6989f3af7 100644 --- a/test/zdtm/static/maps00.c +++ b/test/zdtm/static/maps00.c @@ -137,7 +137,7 @@ static int check_map(struct map *map) } /* prot |= PROT_READ// need barrier before this line, because compiler change order commands. 
- I finded one method: look at next lines*/ + I found one method: look at next lines*/ } else prot &= PROT_WRITE | !PROT_READ | PROT_EXEC; diff --git a/test/zdtm/static/membarrier.c b/test/zdtm/static/membarrier.c new file mode 100644 index 0000000000..85d705ba7b --- /dev/null +++ b/test/zdtm/static/membarrier.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test membarrier() migration"; +const char *test_author = "Michał Mirosław "; + +/* + * Define membarrier() CMDs to avoid depending on exact kernel header version. + */ +#define MEMBARRIER_CMD_GLOBAL_EXPEDITED (1 << 1) +#define MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED (1 << 2) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED (1 << 3) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (1 << 4) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE (1 << 5) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE (1 << 6) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ (1 << 7) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8) +#define MEMBARRIER_CMD_GET_REGISTRATIONS (1 << 9) + +static int membarrier(int cmd, unsigned int flags, int cpu_id) +{ + return syscall(__NR_membarrier, cmd, flags, cpu_id); +} + +static const struct { + const char *name_suffix; + int register_cmd; + int execute_cmd; +} membarrier_cmds[] = { + { "GLOBAL_EXPEDITED", MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, + MEMBARRIER_CMD_GLOBAL_EXPEDITED }, + { "PRIVATE_EXPEDITED", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, + MEMBARRIER_CMD_PRIVATE_EXPEDITED }, + { "PRIVATE_EXPEDITED_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, + { "PRIVATE_EXPEDITED_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, +}; +static const int n_membarrier_cmds = sizeof(membarrier_cmds) / sizeof(*membarrier_cmds); + +static int register_membarriers(void) +{ + int barriers_supported, barriers_registered; + 
bool all_ok = true; + + barriers_supported = membarrier(MEMBARRIER_CMD_QUERY, 0, 0); + if (barriers_supported < 0) { + fail("membarrier() not supported by running kernel"); + return -1; + } + + barriers_registered = 0; + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_supported & membarrier_cmds[i].register_cmd) + continue; + + barriers_registered |= membarrier_cmds[i].register_cmd; + + if (membarrier(membarrier_cmds[i].register_cmd, 0, 0) < 0) { + pr_perror("membarrier(REGISTER_%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if (!all_ok) { + fail("can't register membarrier()s - tried %#x, kernel %#x", + barriers_registered, barriers_supported); + return -1; + } + + if (!barriers_registered) { + fail("no known membarrier() cmds are supported by the kernel"); + return -1; + } + + return barriers_registered; +} + +static bool check_membarriers_compat(int barriers_registered) +{ + bool all_ok = true; + + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_registered & membarrier_cmds[i].register_cmd) + continue; + if (membarrier(membarrier_cmds[i].execute_cmd, 0, 0) < 0) { + pr_perror("membarrier(%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if (!all_ok) + fail("membarrier() check failed"); + + return all_ok; +} + +static bool check_membarriers_get_registrations(int barriers_registered) +{ + int ret = membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + if (errno == EINVAL) { + test_msg("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS) not supported by running kernel"); + return true; + } + fail("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS)"); + return false; + } + if (ret != barriers_registered) { + fail("MEMBARRIER_CMD_GET_REGISTRATIONS check failed, expected: %d, got: %d", + barriers_registered, ret); + return false; + } + + return true; +} + +static bool check_membarriers(int barriers_registered) +{ + return check_membarriers_compat(barriers_registered) && + 
check_membarriers_get_registrations(barriers_registered); +} + +int main(int argc, char **argv) +{ + int barriers_registered; + + test_init(argc, argv); + + barriers_registered = register_membarriers(); + if (barriers_registered < 0) + return 1; + + test_msg("Pre-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + test_daemon(); + test_waitsig(); + + test_msg("Post-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + pass(); + return 0; +} diff --git a/test/zdtm/static/memfd00.c b/test/zdtm/static/memfd00.c index d037f69697..8d77ed06eb 100644 --- a/test/zdtm/static/memfd00.c +++ b/test/zdtm/static/memfd00.c @@ -30,8 +30,10 @@ int main(int argc, char *argv[]) { int fd, fl_flags1, fl_flags2, fd_flags1, fd_flags2; struct statfs statfs1, statfs2; + struct stat stat; off_t pos1, pos2; char buf[5]; + int fmode1, fmode2; test_init(argc, argv); @@ -58,6 +60,13 @@ int main(int argc, char *argv[]) if (lseek(fd, pos1, SEEK_SET) < 0) err(1, "seek error"); + if (fchmod(fd, 0642)) + err(1, "Can't set permission bits"); + + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode1 = stat.st_mode; + test_daemon(); test_waitsig(); @@ -85,6 +94,15 @@ int main(int argc, char *argv[]) return 1; } + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode2 = stat.st_mode; + + if (fmode1 != fmode2) { + fail("stat.st_mode = %#o != %#o", fmode2, fmode1); + return 1; + } + pos2 = lseek(fd, 0, SEEK_CUR); if (pos1 != pos2) { fail("position differs"); diff --git a/test/zdtm/static/memfd04.c b/test/zdtm/static/memfd04.c new file mode 100644 index 0000000000..215e949d15 --- /dev/null +++ b/test/zdtm/static/memfd04.c @@ -0,0 +1,132 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "exec(memfd)"; +const char *test_author = "Michał Mirosław "; + +static int _memfd_create(const char *name, unsigned int flags) +{ + 
return syscall(SYS_memfd_create, name, flags); +} + +static int _execveat(int dirfd, const char *pathname, const char *const argv[], const char *const envp[], int flags) +{ + return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); +} + +static const char *const script_argv[] = { "true", NULL }; +static const char *const script_env[] = { NULL }; + +static bool test_exec_fd(int fd) +{ + int err, pid, status; + + err = fcntl(fd, F_GETFD); + if (err < 0) { + fail("fcntl(F_GETFD)"); + return false; + } + if (err) { + errno = 0; + fail("F_GETFD for the memfd returned %d but expected 0", err); + return false; + } + + pid = fork(); + if (!pid) { + _execveat(fd, "", script_argv, script_env, AT_EMPTY_PATH); + err = errno; + pr_perror("execveat()"); + _exit(err); + } + + if (pid < 0) { + fail("fork()"); + return false; + } + + while (waitpid(pid, &status, 0) != pid) { + if (errno == EINTR) + continue; + fail("waitpid(child=%d)", pid); + return false; + } + + if (status != 0) { + pr_err("child exited with status=%d\n", status); + return false; + } + + return true; +} + +static const char script[] = "#!/bin/true"; +static const size_t script_len = sizeof(script) - 1; + +int main(int argc, char *argv[]) +{ +#ifdef MEMFD05 + char path[PATH_MAX]; + char *addr_p, *addr_s; + int rofd; +#endif + int fd; + + test_init(argc, argv); + + fd = _memfd_create("somename", 0); + if (fd < 0) { + pr_perror("memfd_create()"); + return 1; + } + if (ftruncate(fd, script_len) == -1) { + pr_perror("ftruncate"); + return 1; + } + if (write(fd, script, script_len) != script_len) { + pr_perror("write(memfd)"); + return 1; + } +#ifdef MEMFD05 + snprintf(path, PATH_MAX - 1, "/proc/self/fd/%d", fd); + rofd = open(path, O_RDONLY); + if (rofd < 0) { + pr_perror("unable to open read-only memfd"); + return 1; + } + addr_p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, rofd, 0); + if (addr_p == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } + addr_s = mmap(NULL, 
PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr_s == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } +#endif + + if (!test_exec_fd(fd)) + return 1; + + test_msg("execveat(memfd) succeeded before C/R.\n"); + + test_daemon(); + test_waitsig(); + + if (!test_exec_fd(fd)) + return 1; + + pass(); + + return 0; +} diff --git a/test/zdtm/static/memfd04.desc b/test/zdtm/static/memfd04.desc new file mode 100644 index 0000000000..bbf136d145 --- /dev/null +++ b/test/zdtm/static/memfd04.desc @@ -0,0 +1 @@ +{'deps': ['/bin/true']} diff --git a/test/zdtm/static/memfd05.c b/test/zdtm/static/memfd05.c new file mode 120000 index 0000000000..6caa9556fb --- /dev/null +++ b/test/zdtm/static/memfd05.c @@ -0,0 +1 @@ +memfd04.c \ No newline at end of file diff --git a/test/zdtm/static/memfd05.desc b/test/zdtm/static/memfd05.desc new file mode 120000 index 0000000000..1b4963572b --- /dev/null +++ b/test/zdtm/static/memfd05.desc @@ -0,0 +1 @@ +memfd04.desc \ No newline at end of file diff --git a/test/zdtm/static/mntns_open.c b/test/zdtm/static/mntns_open.c index 7d8bbbaa4e..0430f5b998 100644 --- a/test/zdtm/static/mntns_open.c +++ b/test/zdtm/static/mntns_open.c @@ -17,7 +17,7 @@ #define CLONE_NEWNS 0x00020000 #endif -const char *test_doc = "Check that mnt_id is repsected"; +const char *test_doc = "Check that mnt_id is respected"; const char *test_author = "Pavel Emelianov "; #define MPTS_FILE "F" diff --git a/test/zdtm/static/mntns_root_bind.c b/test/zdtm/static/mntns_root_bind.c index 9e1ba06e63..4c0347cb22 100644 --- a/test/zdtm/static/mntns_root_bind.c +++ b/test/zdtm/static/mntns_root_bind.c @@ -71,7 +71,7 @@ int main(int argc, char **argv) task_waiter_wait4(&t, 2); if (access(bspath, F_OK)) { - fail("%s isn't accessiable", bspath); + fail("%s isn't accessible", bspath); return 1; } diff --git a/test/zdtm/static/mtime_mmap.c b/test/zdtm/static/mtime_mmap.c index faa2d6fad7..4de8438ee2 100644 --- a/test/zdtm/static/mtime_mmap.c +++ 
b/test/zdtm/static/mtime_mmap.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -77,7 +78,7 @@ int main(int argc, char **argv) mtime_new = fst.st_mtime; /* time of last modification */ if (mtime_new <= mtime_old) { - fail("mtime %ld wasn't updated on mmapped %s file", mtime_new, filename); + fail("mtime %" PRId64 " wasn't updated on mmapped %s file", (int64_t)mtime_new, filename); goto failed; } @@ -98,7 +99,7 @@ int main(int argc, char **argv) /* time of last modification */ if (fst.st_mtime != mtime_new) { - fail("After migration, mtime changed to %ld", fst.st_mtime); + fail("After migration, mtime changed to %" PRId64, (int64_t)fst.st_mtime); goto failed; } diff --git a/test/zdtm/static/net_lock_socket_iptables.hook b/test/zdtm/static/net_lock_socket_iptables.hook index 0ee147eb2b..e9fcd73509 100755 --- a/test/zdtm/static/net_lock_socket_iptables.hook +++ b/test/zdtm/static/net_lock_socket_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import socket import time diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index 1e6ee1dea5..f268f2fece 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -414,7 +414,7 @@ static int check_stable_secret(struct test_conf *tc) return -1; } - ret = fscanf(fp, "%s", val); + ret = fscanf(fp, "%200s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); diff --git a/test/zdtm/static/netns_lock_iptables.hook b/test/zdtm/static/netns_lock_iptables.hook index e7daf8a655..b51d3c2cc2 100755 --- a/test/zdtm/static/netns_lock_iptables.hook +++ b/test/zdtm/static/netns_lock_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import subprocess import socket @@ -67,7 +67,7 @@ if sys.argv[1] == "--post-start": cln, addr = srv.accept() cln.sendall(str.encode("--post-restore")) cln.close() - + # Server will be closed when zdtm sends SIGKILL if sys.argv[1] == "--pre-dump": diff --git a/test/zdtm/static/ofd_file_locks.c 
b/test/zdtm/static/ofd_file_locks.c index 68b6f22f52..a68fa38eeb 100644 --- a/test/zdtm/static/ofd_file_locks.c +++ b/test/zdtm/static/ofd_file_locks.c @@ -16,7 +16,7 @@ static int parse_ofd_lock(char *buf, struct flock *lck) if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't lock, skip record */ - num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, &start, fl_end); + num = sscanf(buf, "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); diff --git a/test/zdtm/static/pidfd_child.c b/test/zdtm/static/pidfd_child.c new file mode 100644 index 0000000000..ec559605dc --- /dev/null +++ b/test/zdtm/static/pidfd_child.c @@ -0,0 +1,66 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks pidfd sends signal to child process after restore\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + int pidfd, status; + pid_t child; + + test_init(argc, argv); + + child = fork(); + if (child < 0) { + pr_perror("Unable to fork a new process"); + return 1; + } else if (child == 0) { + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + fail("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + goto err_close; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + goto err_close; + } + + 
pass(); + close(pidfd); + return 0; +err_close: + close(pidfd); + return 1; +} diff --git a/test/zdtm/static/pidfd_dead.c b/test/zdtm/static/pidfd_dead.c new file mode 100644 index 0000000000..9c825899d1 --- /dev/null +++ b/test/zdtm/static/pidfd_dead.c @@ -0,0 +1,244 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of pidfds that point to dead processes\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- grandchild + * + * main opens a pidfd for both child and grandchild. + * Before C/R we kill both child and grandchild. + * We end up with two unique dead pidfds. + */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int open_pidfd_pair(int pidfd[2], int pid) +{ + pidfd[0] = pidfd_open(pid, 0); + if (pidfd[0] < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + pidfd[1] = pidfd_open(pid, 0); + if (pidfd[1] < 0) { + close(pidfd[0]); + pr_perror("pidfd_open() failed"); + return 1; + } + return 0; +} + +static int compare_pidfds(int pidfd[2]) +{ + /* + * After linux 6.9 we can compare inode numbers + * to determine if two pidfds point to the same process. + * While the inode number may change before and after C/R + * pidfds pointing to the same pid should have the same inode number. 
+ */ + struct statx stats[2]; + statx(pidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(pidfd[1], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino != stats[1].stx_ino) + return 1; + return 0; +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, ret, gchild, p[2], status; + int cpidfd[2], gpidfd[2]; + struct statx stats[2]; + + test_init(argc, argv); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild = test_fork(); + close(p[READ]); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[WRITE]); + while(1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + + return 0; + } + } + + ret = open_pidfd_pair(cpidfd, child); + if (ret) + return 1; + + close(p[WRITE]); + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[READ]); + + ret = open_pidfd_pair(gpidfd, gchild); + if (ret) + return 1; + + /* + * We kill grandchild and child processes only after opening pidfds. 
+ */ + if (pidfd_send_signal(gpidfd[0], SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + goto fail_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto fail_close; + } + + if (!WIFEXITED(status)) { + fail("Expected child to exit normally"); + goto fail_close; + } + + if (WEXITSTATUS(status) != 0) { + fail("Expected child to exit with 0"); + goto fail_close; + } + usleep(1000); + + if (kill(gchild, 0) != -1 && errno != ESRCH) { + fail("Expected grand child to not exist"); + goto fail_close; + } + + if (kill(child, 0) != -1 && errno != ESRCH) { + fail("Expected child to not exist"); + goto fail_close; + } + + test_daemon(); + test_waitsig(); + + ret = compare_pidfds(cpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + ret = compare_pidfds(gpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + statx(cpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(gpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino == stats[1].stx_ino) { + fail("pidfds pointing to diff pids should have diff inodes"); + goto fail_close; + } + + pass(); + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 0; + +fail_close: + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 1; +} diff --git a/test/zdtm/static/pidfd_kill.c b/test/zdtm/static/pidfd_kill.c new file mode 100644 index 0000000000..6232d033aa --- /dev/null +++ b/test/zdtm/static/pidfd_kill.c @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Kill child and grandchild process using pidfds\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, 
pidfd, sig, info, flags); +} + +static int wait_for_child(int child) +{ + int status; + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + test_msg("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + } + + return 0; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, gchild, cpidfd, gpidfd, gchild_pid, ret; + int p[2]; + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + test_init(argc, argv); + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + gchild = fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } + + if (gchild == 0) { + test_waitsig(); + return 0; + } + + close(p[READ]); + if (write(p[WRITE], &gchild, sizeof(gchild)) + != sizeof(gchild)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + test_waitsig(); + return wait_for_child(gchild); + } + + cpidfd = pidfd_open(child, 0); + if (cpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &gchild_pid, sizeof(gchild_pid)) + != sizeof(gchild_pid)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + + gpidfd = pidfd_open(gchild_pid, 0); + if (gpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(gpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + if (pidfd_send_signal(cpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + ret = wait_for_child(child); + if (ret) + goto fail_close; + + pass(); + close(cpidfd); + close(gpidfd); + return 0; + +fail_close: + fail(); + close(cpidfd); + close(gpidfd); + return 1; +} diff --git a/test/zdtm/static/pidfd_of_thread.c b/test/zdtm/static/pidfd_of_thread.c new file mode 100644 index 0000000000..d232c7ac1d --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.c @@ 
-0,0 +1,114 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check C/R of pidfds that point to threads\n"; +const char *test_author = "Bhavik Sachdev "; + +/* see also: https://codebrowser.dev/glibc/glibc/sysdeps/unix/sysv/linux/tst-clone3.c.html */ + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +#ifndef PIDFD_SIGNAL_THREAD +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#endif + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int thread_func(void *a) +{ + test_waitsig(); + return 0; +} + +#define CTID_INIT_VAL 1 + +int main(int argc, char* argv[]) +{ + char st[64 * 1024] __attribute__ ((aligned)); + pid_t tid; + int pidfd, test_pidfd; + futex_t exited; + + int clone_flags = CLONE_THREAD; + clone_flags |= CLONE_VM | CLONE_SIGHAND; + clone_flags |= CLONE_CHILD_CLEARTID; + + test_init(argc, argv); + + test_pidfd = pidfd_open(getpid(), 0); + if (test_pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + /* PIDFD_THREAD, PIDFD_SIGNAL_THREAD are supported only with pidfs */ + if (get_fs_type(test_pidfd) != PID_FS_MAGIC) { + test_daemon(); + test_waitsig(); + skip("pidfs not supported."); + close(test_pidfd); + return 0; + } + close(test_pidfd); + + futex_set(&exited, CTID_INIT_VAL); + + tid = clone(thread_func, st + sizeof(st), clone_flags, NULL, NULL, NULL, &(exited.raw)); + if (tid == -1) { + pr_perror("clone() failed"); + return 1; + } + + test_msg("Successfully created a thread with tid: %d\n", tid); + pidfd = pidfd_open(tid, PIDFD_THREAD); + if (pidfd < 0) { + 
pr_perror("pidfd_open() failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, PIDFD_SIGNAL_THREAD)) { + pr_perror("pidfd_send_signal() failed"); + fail(); + close(pidfd); + return 1; + } + + test_msg("Waiting for thread to exit\n"); + futex_wait_until(&exited, 0); + + pass(); + close(pidfd); + return 0; +} diff --git a/test/zdtm/static/pidfd_of_thread.desc b/test/zdtm/static/pidfd_of_thread.desc new file mode 100644 index 0000000000..802caed655 --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.desc @@ -0,0 +1 @@ +{'flags': 'noauto crfail'} diff --git a/test/zdtm/static/pidfd_self.c b/test/zdtm/static/pidfd_self.c new file mode 100644 index 0000000000..2730ee123d --- /dev/null +++ b/test/zdtm/static/pidfd_self.c @@ -0,0 +1,140 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check pidfd /proc/self/fdinfo/ entry remains consistent after checkpoint/restore\n"; +const char *test_author = "Bhavik Sachdev "; + +struct pidfd_status { + unsigned int flags; + pid_t pid; +}; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static void show_pidfd(char *prefix, struct pidfd_status *s) +{ + test_msg("\n\t%s\n\tflags: 0%o\n\tpid: %d\n", prefix, s->flags, s->pid); +} + +static int parse_self_fdinfo(int pidfd, struct pidfd_status *s) +{ + char buf[256]; + int ret = -1; + FILE *f; + + sprintf(buf, "/proc/self/fdinfo/%d", pidfd); + f = fopen(buf, "r"); + if (!f) { + perror("Can't open /proc/self/fdinfo/ to parse"); + return -1; + } + + memset(s, 0, sizeof(*s)); + + /* + * flags: file access mode (octal) 02000002 => [O_RDWR | O_CLOEXEC] + * pid: the pid to which we have pidfd open + */ + while (fgets(buf, sizeof(buf), f)) { + if (!fgets(buf, sizeof(buf), f)) + goto 
parse_err; + + if (sscanf(buf, "flags: 0%o", &s->flags) != 1) { + goto parse_err; + } + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (sscanf(buf, "Pid: %d", &s->pid) != 1) + goto parse_err; + ret = 0; + break; + } + + if (ret) + goto parse_err; +err: + fclose(f); + return ret; + +parse_err: + pr_perror("Format error"); + goto err; +} + +static int check_pidfd(int fd, struct pidfd_status *old) +{ + struct pidfd_status new; + + if (parse_self_fdinfo(fd, &new)) + return -1; + + show_pidfd("restored", &new); + + if (old->flags != new.flags || old->pid != new.pid) + return -1; + + return 0; +} + +int main(int argc, char* argv[]) +{ + struct pidfd_status old; + int pidfd, ret; + + test_init(argc, argv); + + pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + parse_self_fdinfo(pidfd, &old); + + show_pidfd("old", &old); + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = check_pidfd(pidfd, &old); + if (ret) { + fail(); + goto err; + } + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + fail(); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +} diff --git a/test/zdtm/static/pthread_timers.c b/test/zdtm/static/pthread_timers.c index 5246a985fd..b1b2a9a23d 100644 --- a/test/zdtm/static/pthread_timers.c +++ b/test/zdtm/static/pthread_timers.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -69,7 +70,8 @@ int main(int argc, char **argv) } if (itimerspec.it_interval.tv_nsec != TEST_INTERVAL_NSEC || itimerspec.it_interval.tv_sec) { - pr_perror("wrong interval: %ld:%ld", itimerspec.it_interval.tv_sec, itimerspec.it_interval.tv_nsec); + pr_perror("wrong interval: %" PRId64 ":%" PRId64, + 
(int64_t)itimerspec.it_interval.tv_sec, (int64_t)itimerspec.it_interval.tv_nsec); return 1; } diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c index 471ad6a43f..7add7801eb 100644 --- a/test/zdtm/static/rseq00.c +++ b/test/zdtm/static/rseq00.c @@ -46,12 +46,15 @@ static inline void *__criu_thread_pointer(void) static inline void unregister_glibc_rseq(void) { struct rseq *rseq = (struct rseq *)((char *)__criu_thread_pointer() + __rseq_offset); + unsigned int size = __rseq_size; /* hack: mark glibc rseq structure as failed to register */ rseq->cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED; /* unregister rseq */ - syscall(__NR_rseq, (void *)rseq, __rseq_size, 1, RSEQ_SIG); + if (__rseq_size < 32) + size = 32; + syscall(__NR_rseq, (void *)rseq, size, 1, RSEQ_SIG); } #else static inline void unregister_glibc_rseq(void) diff --git a/test/zdtm/static/sched_policy00.c b/test/zdtm/static/sched_policy00.c index dc71eed940..a351350503 100644 --- a/test/zdtm/static/sched_policy00.c +++ b/test/zdtm/static/sched_policy00.c @@ -51,7 +51,7 @@ int main(int argc, char **argv) } p.sched_priority = param; - if (sched_setscheduler(pid, SCHED_RR, &p)) { + if (sched_setscheduler(pid, SCHED_RR | SCHED_RESET_ON_FORK, &p)) { pr_perror("Can't set policy"); kill(pid, SIGKILL); return -1; @@ -61,7 +61,7 @@ int main(int argc, char **argv) test_waitsig(); ret = sched_getscheduler(pid); - if (ret != SCHED_RR) { + if (ret != (SCHED_RR | SCHED_RESET_ON_FORK)) { fail("Broken/No policy"); err++; } diff --git a/test/zdtm/static/seccomp_filter_inheritance.c b/test/zdtm/static/seccomp_filter_inheritance.c index 7a86cd85ee..5afcb3f845 100644 --- a/test/zdtm/static/seccomp_filter_inheritance.c +++ b/test/zdtm/static/seccomp_filter_inheritance.c @@ -100,7 +100,7 @@ int main(int argc, char **argv) if (filter_syscall(__NR_ptrace) < 0) _exit(1); - if (filter_syscall(__NR_fstat) < 0) + if (filter_syscall(__NR_statx) < 0) _exit(1); zdtm_seccomp = 1; diff --git 
a/test/zdtm/static/seccomp_no_new_privs.c b/test/zdtm/static/seccomp_no_new_privs.c new file mode 100644 index 0000000000..95f9501ed4 --- /dev/null +++ b/test/zdtm/static/seccomp_no_new_privs.c @@ -0,0 +1,42 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that NO_NEW_PRIVS attribute is restored"; +const char *test_author = "Michał Mirosław "; + +int main(int argc, char **argv) +{ + int ret; + + test_init(argc, argv); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 0) + fail("initial NO_NEW_PRIVS = %d != 0", ret); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (ret) { + pr_perror("Can't set NO_NEW_PRIVS attribute"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 1) + fail("restored NO_NEW_PRIVS = %d != 1", ret); + + pass(); + return 0; +} diff --git a/test/zdtm/static/selinux00.checkskip b/test/zdtm/static/selinux00.checkskip index 8d946a75e3..4c85647d10 100755 --- a/test/zdtm/static/selinux00.checkskip +++ b/test/zdtm/static/selinux00.checkskip @@ -2,6 +2,19 @@ test -d /sys/fs/selinux || exit 1 +# check if necessary commands are installed +if ! command -v setenforce &>/dev/null; then + exit 1 +fi + +if ! command -v setsebool &>/dev/null; then + exit 1 +fi + +if ! 
command -v getsebool &>/dev/null; then + exit 1 +fi + # See selinux00.hook for details getsebool unconfined_dyntrans_all > /dev/null 2>&1 diff --git a/test/zdtm/static/sock_ip_opts00.c b/test/zdtm/static/sock_ip_opts00.c index 08970c0daf..cb464365d9 100644 --- a/test/zdtm/static/sock_ip_opts00.c +++ b/test/zdtm/static/sock_ip_opts00.c @@ -3,6 +3,7 @@ #include #include +#include #include #include "zdtmtst.h" @@ -19,11 +20,14 @@ const char *test_author = "Pavel Tikhomirov "; struct sk_opt { int level; int opt; + int val; }; struct sk_opt sk_opts_v4[] = { - { SOL_IP, IP_FREEBIND }, - { SOL_IP, IP_PKTINFO }, + { SOL_IP, IP_FREEBIND, IP_OPT_VAL }, + { SOL_IP, IP_PKTINFO, IP_OPT_VAL }, + { SOL_IP, IP_TTL, 32 }, + { SOL_IP, IP_TOS, IPTOS_TOS(IPTOS_THROUGHPUT) }, }; #ifndef IPV6_FREEBIND @@ -31,8 +35,8 @@ struct sk_opt sk_opts_v4[] = { #endif struct sk_opt sk_opts_v6[] = { - { SOL_IPV6, IPV6_FREEBIND }, - { SOL_IPV6, IPV6_RECVPKTINFO }, + { SOL_IPV6, IPV6_FREEBIND, IP_OPT_VAL }, + { SOL_IPV6, IPV6_RECVPKTINFO, IP_OPT_VAL }, }; struct sk_conf { @@ -71,7 +75,7 @@ int main(int argc, char **argv) n_opts = sk_confs[i].domain == AF_INET ? 
ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); for (j = 0; j < n_opts; j++) { - val = IP_OPT_VAL; + val = opts[j].val; if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); goto close; @@ -93,7 +97,7 @@ int main(int argc, char **argv) goto close; } - if (val != IP_OPT_VAL) { + if (val != opts[j].val) { fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); goto close; diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index 5b4624f6de..fcf00ffed8 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -12,22 +12,28 @@ const char *test_author = "Pavel Emelyanov "; #define TEST_PORT 59687 #define TEST_ADDR INADDR_ANY -#define NOPTS 8 - int main(int argc, char **argv) { - int sock, ret = 0, vname[NOPTS], val[NOPTS], rval, i; + #define OPT(x) { x, #x } + static const struct { + int opt; + const char *name; + } vname[] = { + OPT(SO_PRIORITY), + OPT(SO_RCVLOWAT), + OPT(SO_MARK), + OPT(SO_PASSCRED), + OPT(SO_PASSSEC), + OPT(SO_DONTROUTE), + OPT(SO_NO_CHECK), + OPT(SO_OOBINLINE), + }; + static const int NOPTS = sizeof(vname) / sizeof(*vname); + #undef OPT + + int sock, ret = 0, val[NOPTS], rval, i; socklen_t len = sizeof(int); - vname[0] = SO_PRIORITY; - vname[1] = SO_RCVLOWAT; - vname[2] = SO_MARK; - vname[3] = SO_PASSCRED; - vname[4] = SO_PASSSEC; - vname[5] = SO_DONTROUTE; - vname[6] = SO_NO_CHECK; - vname[7] = SO_OOBINLINE; - test_init(argc, argv); sock = socket(PF_INET, SOCK_STREAM, 0); @@ -37,29 +43,29 @@ int main(int argc, char **argv) } for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &val[i], &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { - pr_perror("can't get option %d", i); + pr_perror("can't get %s", vname[i].name); return 1; } val[i]++; - ret = 
setsockopt(sock, SOL_SOCKET, vname[i], &val[i], len); + ret = setsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { - pr_perror("can't set option %d", i); + pr_perror("can't set %s = %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d 2", i); + pr_perror("can't re-get %s", vname[i].name); return 1; } if (rval != val[i]) { if (rval + 1 == val[i]) { - pr_perror("can't reset option %d want %d have %d", i, val[i], rval); + pr_perror("failed to set %s: want %d have %d", vname[i].name, val[i], rval); return 1; } @@ -72,14 +78,15 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d again", i); + pr_perror("can't verify %s", vname[i].name); return 1; } if (val[i] != rval) { - fail("option %d changed", i); + errno = 0; + fail("%s changed: %d -> %d", vname[i].name, val[i], rval); return 1; } } diff --git a/test/zdtm/static/sock_tcp_opts00.c b/test/zdtm/static/sock_tcp_opts00.c new file mode 100644 index 0000000000..8061bc9ea1 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts00.c @@ -0,0 +1,96 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that different tcp socket options are restored"; +const char *test_author = "Juntong Deng "; + +#ifdef ZDTM_VAL_ZERO +#define TCP_OPT_VAL 0 +#else +#define TCP_OPT_VAL 1 +#endif + +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + +struct sk_opt { + int level; + int opt; + int val; +}; + +struct sk_opt tcp_sk_opts[] = { + { SOL_TCP, TCP_CORK, TCP_OPT_VAL }, + { SOL_TCP, TCP_NODELAY, TCP_OPT_VAL }, +}; + +struct sk_conf { + int domain; + int type; + int protocol; + int sk; +} sk_confs[] = { + { AF_INET, SOCK_STREAM, IPPROTO_TCP 
}, + { AF_INET6, SOCK_STREAM, IPPROTO_TCP }, +}; + +int main(int argc, char **argv) +{ + struct sk_opt *opts = tcp_sk_opts; + int n_opts = ARRAY_SIZE(tcp_sk_opts); + int exit_code = 1; + int i, j, val; + socklen_t len; + + test_init(argc, argv); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + sk_confs[i].sk = socket(sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol); + if (sk_confs[i].sk == -1) { + pr_perror("socket(%d,%d,%d) failed", sk_confs[i].domain, sk_confs[i].type, + sk_confs[i].protocol); + goto close; + } + } + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + for (j = 0; j < n_opts; j++) { + val = opts[j].val; + if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { + pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + for (j = 0; j < n_opts; j++) { + len = sizeof(int); + if (getsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, &len) == -1) { + pr_perror("getsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + + if (val != opts[j].val) { + fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, + sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); + goto close; + } + } + } + + pass(); + exit_code = 0; +close: + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) + close(sk_confs[i].sk); + return exit_code; +} diff --git a/test/zdtm/static/sock_tcp_opts00.desc b/test/zdtm/static/sock_tcp_opts00.desc new file mode 100644 index 0000000000..2eac7e654b --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts00.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/test/zdtm/static/sock_tcp_opts01.c b/test/zdtm/static/sock_tcp_opts01.c new file mode 120000 index 0000000000..5219c2e989 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts01.c @@ -0,0 +1 @@ +./sock_tcp_opts00.c \ No newline at end of file diff --git 
a/test/zdtm/static/sock_tcp_opts01.desc b/test/zdtm/static/sock_tcp_opts01.desc new file mode 120000 index 0000000000..fb1dfdcd13 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts01.desc @@ -0,0 +1 @@ +./sock_tcp_opts00.desc \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-fin-wait1.hook b/test/zdtm/static/socket-tcp-fin-wait1.hook index 9504557dad..30f8ce0710 100755 --- a/test/zdtm/static/socket-tcp-fin-wait1.hook +++ b/test/zdtm/static/socket-tcp-fin-wait1.hook @@ -1,7 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys -sys.path.append("../crit") +sys.path.append("../lib") import pycriu import os, os.path diff --git a/test/zdtm/static/socket-tcp-nfconntrack.c b/test/zdtm/static/socket-tcp-ipt-nfconntrack.c similarity index 100% rename from test/zdtm/static/socket-tcp-nfconntrack.c rename to test/zdtm/static/socket-tcp-ipt-nfconntrack.c diff --git a/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc new file mode 100644 index 0000000000..53dd822854 --- /dev/null +++ b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'has_ipt_legacy', + 'flavor': 'h', + 'opts': '--tcp-established', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp-nfconntrack.desc b/test/zdtm/static/socket-tcp-nfconntrack.desc deleted file mode 100644 index add2513f81..0000000000 --- a/test/zdtm/static/socket-tcp-nfconntrack.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.c b/test/zdtm/static/socket-tcp-nft-nfconntrack.c new file mode 120000 index 0000000000..8cb60dd03a --- /dev/null +++ b/test/zdtm/static/socket-tcp-nft-nfconntrack.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.desc b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc new file mode 100644 index 0000000000..38a4eb3897 --- /dev/null +++ 
b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc @@ -0,0 +1,7 @@ +{ + 'flavor': 'h', + 'feature': 'network_lock_nftables', + 'opts': '--tcp-established', + 'dopts': '--network-lock nftables', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c index f6ef473853..bc20754963 100644 --- a/test/zdtm/static/socket-tcp.c +++ b/test/zdtm/static/socket-tcp.c @@ -67,17 +67,38 @@ int main(int argc, char **argv) int val; socklen_t optlen; -#ifdef ZDTM_CONNTRACK +#ifdef ZDTM_IPT_CONNTRACK if (unshare(CLONE_NEWNET)) { pr_perror("unshare"); return 1; } if (system("ip link set up dev lo")) return 1; - if (system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + + if (system("iptables-legacy -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + return 1; + if (system("iptables-legacy -w -A INPUT -j DROP")) + return 1; + +#endif + +#ifdef ZDTM_NFT_CONNTRACK + if (unshare(CLONE_NEWNET)) { + pr_perror("unshare"); return 1; - if (system("iptables -w -A INPUT -j DROP")) + } + if (system("ip link set up dev lo")) + return 1; + + if (system("nft add table ip filter")) return 1; + if (system("nft 'add chain ip filter INPUT { type filter hook input priority 0 ; }'")) + return 1; + if (system("nft add rule ip filter INPUT iifname \"lo\" ip protocol tcp ct state new,established counter accept")) + return 1; + if (system("nft add rule ip filter INPUT counter drop")) + return 1; + #endif #ifdef ZDTM_TCP_LOCAL diff --git a/test/zdtm/static/socket_udp_shutdown.c b/test/zdtm/static/socket_udp_shutdown.c index 91dc8f30a4..a7658b9dd7 100644 --- a/test/zdtm/static/socket_udp_shutdown.c +++ b/test/zdtm/static/socket_udp_shutdown.c @@ -28,8 +28,8 @@ int main(int argc, char **argv) test_init(argc, argv); - sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); - sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + sk1 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); + sk2 = socket(PF_INET, SOCK_DGRAM | 
SOCK_NONBLOCK, IPPROTO_UDP); if (sk1 < 0 || sk2 < 0) { pr_perror("Can't create socket"); exit(1); diff --git a/test/zdtm/static/stopped.c b/test/zdtm/static/stopped.c index 059a2a92aa..26b0174eda 100644 --- a/test/zdtm/static/stopped.c +++ b/test/zdtm/static/stopped.c @@ -65,7 +65,7 @@ int main(int argc, char **argv) } if (WIFSTOPPED(status)) - test_msg("The procces stopped\n"); + test_msg("The process stopped\n"); else { fail("The process doesn't stopped"); goto out; diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index ab88120c2c..55609f2605 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -17,6 +17,7 @@ int main(int argc, char **argv) unsigned long orig_flags = 0, new_flags = 0; unsigned long orig_madv = 0, new_madv = 0; void *area; + int ret; test_init(argc, argv); @@ -35,9 +36,46 @@ int main(int argc, char **argv) return -1; } + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); + return -1; + } + + test_msg("Fetch pre-migration flags/adv\n"); + if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) + return -1; + + errno = 0; + if (orig_flags != new_flags) { + fail("Flags changed %lx -> %lx", orig_flags, new_flags); + return -1; + } + + if (orig_madv != new_madv) { + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); + return -1; + } + test_daemon(); test_waitsig(); + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting post-migration THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); + return -1; + } + if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { pr_perror("Enabling THP failed"); return -1; @@ -47,15 +85,14 @@ int main(int argc, char **argv) if 
(get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) return -1; + errno = 0; if (orig_flags != new_flags) { - pr_err("Flags are changed %lx -> %lx\n", orig_flags, new_flags); - fail(); + fail("Flags changed %lx -> %lx", orig_flags, new_flags); return -1; } if (orig_madv != new_madv) { - pr_err("Madvs are changed %lx -> %lx\n", orig_madv, new_madv); - fail(); + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); return -1; } diff --git a/test/zdtm/static/thread_different_uid_gid.c b/test/zdtm/static/thread_different_uid_gid.c index 3a0b6291b1..88f99659b3 100644 --- a/test/zdtm/static/thread_different_uid_gid.c +++ b/test/zdtm/static/thread_different_uid_gid.c @@ -130,7 +130,7 @@ int main(int argc, char **argv) ret = syscall(SYS_setresgid, maingroup, maingroup, maingroup); if (ret >= 0) { ret = syscall(SYS_setresuid, mainuser, mainuser, mainuser); - } else if (ret < 0) { + } else { pr_perror("Failed to drop privileges"); exit(1); } diff --git a/test/zdtm/static/vdso00.c b/test/zdtm/static/vdso00.c index a9bef4dbd2..69123a2032 100644 --- a/test/zdtm/static/vdso00.c +++ b/test/zdtm/static/vdso00.c @@ -1,6 +1,6 @@ #include #include - +#include #include #include @@ -19,14 +19,14 @@ int main(int argc, char *argv[]) test_msg("%s pid %d\n", argv[0], getpid()); gettimeofday(&tv, &tz); - test_msg("%d time: %10li\n", getpid(), tv.tv_sec); + test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); test_daemon(); test_waitsig(); /* this call will fail if vDSO is corrupted */ gettimeofday(&tv, &tz); - test_msg("%d time: %10li\n", getpid(), tv.tv_sec); + test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); pass(); diff --git a/test/zdtm/static/vdso01.c b/test/zdtm/static/vdso01.c index d8d64155ad..d8b3c94d5e 100644 --- a/test/zdtm/static/vdso01.c +++ b/test/zdtm/static/vdso01.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -324,7 +325,8 @@ static int vdso_clock_gettime_handler(void *func) 
clock_gettime(CLOCK_REALTIME, &ts1); vdso_clock_gettime(CLOCK_REALTIME, &ts2); - test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); + test_msg("clock_gettime: tv_sec %" PRId64 " vdso_clock_gettime: tv_sec %" PRId64 "\n", + (int64_t)ts1.tv_sec, (int64_t)ts2.tv_sec); if (labs(ts1.tv_sec - ts2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); @@ -354,7 +356,8 @@ static int vdso_gettimeofday_handler(void *func) gettimeofday(&tv1, &tz); vdso_gettimeofday(&tv2, &tz); - test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); + test_msg("gettimeofday: tv_sec %" PRId64 " vdso_gettimeofday: tv_sec %" PRId64 "\n", + (int64_t)tv1.tv_sec, (int64_t)tv2.tv_sec); if (labs(tv1.tv_sec - tv2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); @@ -372,7 +375,7 @@ static int vdso_time_handler(void *func) t1 = time(NULL); t2 = vdso_time(NULL); - test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t1); + test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t2); if (labs(t1 - t2) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); diff --git a/test/zdtm/transition/ipc.c b/test/zdtm/transition/ipc.c index 0f16dbc685..7660f70af0 100644 --- a/test/zdtm/transition/ipc.c +++ b/test/zdtm/transition/ipc.c @@ -178,7 +178,7 @@ int main(int argc, char **argv) pr_perror("Child 2 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; - pr_perror("Child 2 couldn't inititalise"); + pr_perror("Child 2 couldn't initialise"); } out_child: kill(pid1, SIGTERM); @@ -188,7 +188,7 @@ int main(int argc, char **argv) pr_perror("Child 1 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; - pr_perror("Child 1 couldn't inititalise"); + pr_perror("Child 1 couldn't initialise"); } out_shdt: shmdt(mem); diff --git a/test/zdtm/transition/lazy-thp.c b/test/zdtm/transition/lazy-thp.c index 2bf99dc4ca..2e9722b963 100644 --- a/test/zdtm/transition/lazy-thp.c +++ b/test/zdtm/transition/lazy-thp.c @@ -25,7 +25,7 
@@ int main(int argc, char **argv) test_init(argc, argv); - /* we presume that malloc returns not page aliged address */ + /* we presume that malloc returns not page aligned address */ mem = malloc(PAGE_SIZE * N_PAGES); org = malloc(PAGE_SIZE); if (!mem || !org) { diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c index 0fbcc2dca0..08a7a8e1a6 100644 --- a/test/zdtm/transition/rseq01.c +++ b/test/zdtm/transition/rseq01.c @@ -33,7 +33,10 @@ static inline void *thread_pointer(void) static inline void unregister_old_rseq(void) { /* unregister rseq */ - syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); + unsigned int size = __rseq_size; + if (__rseq_size < 32) + size = 32; + syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), size, 1, RSEQ_SIG); } #else static inline void unregister_old_rseq(void) diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 5e849b904b..44316893da 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -102,7 +102,7 @@ int main(int argc, char **argv) /* * pidns is used to avoid conflicts * mntns is used to mount /proc - * net is used to avoid conflicts of parasite sockets + * net is used to avoid conflicts between network tests */ if (!uid) if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC))