From 823db1bfccf6463225af76a09748d3691405e0f9 Mon Sep 17 00:00:00 2001 From: prakritigoyal19 Date: Sun, 7 Jun 2020 09:22:53 +0530 Subject: [PATCH 001/321] Add flog to CRIU Changes made in this commit: - Include a copy of flog as a separate tree. - Modify the Makefile to add and compile the flog code. Signed-off-by: prakritigoyal19 --- Makefile | 12 ++- flog/Makefile | 29 ++++++ flog/built-in.S | 4 + flog/include/compiler.h | 71 +++++++++++++ flog/include/flog.h | 9 ++ flog/include/log.h | 17 ++++ flog/include/types.h | 16 +++ flog/include/uapi/flog.h | 149 +++++++++++++++++++++++++++ flog/include/util.h | 37 +++++++ flog/src/Makefile | 5 + flog/src/flog.c | 215 +++++++++++++++++++++++++++++++++++++++ flog/src/main.c | 170 +++++++++++++++++++++++++++++++ flog/tests/test00 | 22 ++++ 13 files changed, 755 insertions(+), 1 deletion(-) create mode 100644 flog/Makefile create mode 100644 flog/built-in.S create mode 100644 flog/include/compiler.h create mode 100644 flog/include/flog.h create mode 100644 flog/include/log.h create mode 100644 flog/include/types.h create mode 100644 flog/include/uapi/flog.h create mode 100644 flog/include/util.h create mode 100644 flog/src/Makefile create mode 100644 flog/src/flog.c create mode 100644 flog/src/main.c create mode 100755 flog/tests/test00 diff --git a/Makefile b/Makefile index 8061a42c45..377c6a3b5f 100644 --- a/Makefile +++ b/Makefile @@ -156,7 +156,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: flog criu lib crit .PHONY: all # @@ -242,6 +242,15 @@ soccr/built-in.o: $(CONFIG_HEADER) .FORCE $(SOCCR_A): |soccr/built-in.o criu-deps += $(SOCCR_A) +#flog gets used by criu, build it earlier + +flogMakefile: ; +flog%: + $(Q) $(MAKE) $(build)=flog $@ +flog: + $(Q) $(MAKE) $(build)=flog all +.PHONY: flog + # # CRIU building done in own directory # with slightly different rules so we @@ -284,6 +293,7 @@ lib: crit clean mrproper: $(Q) $(MAKE) 
$(build)=images $@ + $(Q) $(MAKE) $(build)=flog $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ diff --git a/flog/Makefile b/flog/Makefile new file mode 100644 index 0000000000..12255af719 --- /dev/null +++ b/flog/Makefile @@ -0,0 +1,29 @@ +OPTS=-ggdb3 -Wall -Werror +export OPTS + +CFLAGS += -iquote include +CFLAGS += -iquote flog/include +CFLAGS += -iquote flog/include/uapi + +include $(__nmk_dir)msg.mk + +$(eval $(call gen-built-in,src)) + +flog: + $(Q) $(MAKE) $(build)=$(obj)/src all +.PHONY: flog + +clean-flog: + $(call msg-gen, $@) + $(Q) $(MAKE) $(build)=$(obj)/src clean + $(Q) $(RM) built-in.o +.PHONY: clean-flog + +clean: clean-flog +mrproper: clean + +test: + ./tests/test00 + +all-y += flog + diff --git a/flog/built-in.S b/flog/built-in.S new file mode 100644 index 0000000000..26627d0544 --- /dev/null +++ b/flog/built-in.S @@ -0,0 +1,4 @@ +SECTIONS +{ + .rodata : { _rodata_start = . ; *(.rodata*) ; _rodata_end = . ;} +} diff --git a/flog/include/compiler.h b/flog/include/compiler.h new file mode 100644 index 0000000000..3e56eb0e64 --- /dev/null +++ b/flog/include/compiler.h @@ -0,0 +1,71 @@ +#ifndef __COMPILER_H__ +#define __COMPILER_H__ + +/* + * Various definitions for success build, + * picked from various places, mostly from + * the linux kernel. + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) + +#define __section(S) __attribute__ ((__section__(#S))) + +#ifndef __always_inline +# define __always_inline inline __attribute__((always_inline)) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#ifndef always_inline +# define always_inline __always_inline +#endif + +#ifndef noinline +# define noinline __attribute__((noinline)) +#endif + +#define __aligned(x) __attribute__((aligned(x))) + +#ifndef offsetof +# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define barrier() asm volatile("" ::: "memory") + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +#define __round_mask(x, y) ((__typeof__(x))((y) - 1)) +#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? 
_max1 : _max2; }) + +#define is_log2(v) (((v) & ((v) - 1)) == 0) + +#endif /* __COMPILER_H__ */ diff --git a/flog/include/flog.h b/flog/include/flog.h new file mode 100644 index 0000000000..f00c20541f --- /dev/null +++ b/flog/include/flog.h @@ -0,0 +1,9 @@ +#ifndef __FLOG_H__ +#define __FLOG_H__ + +#include +#include + +#include "uapi/flog.h" + +#endif /* __FLOG_H__ */ diff --git a/flog/include/log.h b/flog/include/log.h new file mode 100644 index 0000000000..1a165ea9fb --- /dev/null +++ b/flog/include/log.h @@ -0,0 +1,17 @@ +#ifndef __LOG_H__ +#define __LOG_H__ + +#include + +#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) + +#if 1 +# define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#else +# define pr_debug(fmt, ...) +#endif + +#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): "fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_perror(fmt, ...) fprintf(stderr, "Error (%s:%d): "fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) + +#endif /* __LOG_H__ */ diff --git a/flog/include/types.h b/flog/include/types.h new file mode 100644 index 0000000000..0e15bfbff5 --- /dev/null +++ b/flog/include/types.h @@ -0,0 +1,16 @@ +#ifndef __FLOG_TYPES_H__ +#define __FLOG_TYPES_H__ + +#include +#include + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +#endif /* __FLOG_TYPES_H__ */ diff --git a/flog/include/uapi/flog.h b/flog/include/uapi/flog.h new file mode 100644 index 0000000000..2d879110fc --- /dev/null +++ b/flog/include/uapi/flog.h @@ -0,0 +1,149 @@ +#ifndef __UAPI_FLOG_H__ +#define __UAPI_FLOG_H__ + +#include +#include +#include + +/* + * We work with up to 32 arguments in macros here. + * If more provided -- behaviour is undefined. + */ + +/* + * By Laurent Deniau at https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s + */ +#define FLOG_PP_NARG_(...) 
FLOG_PP_ARG_N(__VA_ARGS__) +#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) + +#define FLOG_PP_ARG_N( _0, _1, _2, _3, _4, \ + _5, _6, _7, _8, _9, \ + _10,_11,_12,_13,_14, \ + _15,_16,_17,_18,_19, \ + _20,_21,_22,_23,_24, \ + _25,_26,_27,_28,_29, \ + _30,_31, N, ...) N + +#define FLOG_PP_RSEQ_N() \ + 31, 30, 29, 28, 27, \ + 26, 25, 24, 23, 22, \ + 21, 20, 19, 18, 17, \ + 16, 15, 14, 13, 12, \ + 11, 10, 9, 8, 7, \ + 6, 5, 4, 3, 2, \ + 1, 0 + +#define FLOG_GENMASK_0(N, x) 0 +#define FLOG_GENMASK_1(N, op, x, ...) (op(N, 0, x)) +#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_11(N, op, x, ...) ((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_15(N, op, x, ...) ((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_17(N, op, x, ...) 
((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_22(N, op, x, ...) ((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_32(N, op, x, ...) ((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) + +#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) +#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) +#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 + +#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) +#define FLOG_GENMASK(op, ...) FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) + +#define flog_genbit(ord, n, v, ...) 
\ + _Generic((v), \ + \ + /* Basic types */ \ + char: 0, \ + signed char: 0, \ + unsigned char: 0, \ + signed short int: 0, \ + unsigned short int: 0, \ + signed int: 0, \ + unsigned int: 0, \ + signed long: 0, \ + unsigned long: 0, \ + signed long long: 0, \ + unsigned long long: 0, \ + \ + /* Not used for a while */ \ + /* float: 12, */ \ + /* double: 13, */ \ + /* long double: 14, */ \ + \ + /* Basic poniters */ \ + char *: (1u << (ord - n - 1)), \ + signed char *: (1u << (ord - n - 1)), \ + unsigned char *: (1u << (ord - n - 1)), \ + signed short int *: 0, \ + unsigned short int *: 0, \ + signed int *: 0, \ + unsigned int *: 0, \ + signed long *: 0, \ + unsigned long *: 0, \ + signed long long *: 0, \ + unsigned long long *: 0, \ + void *: 0, \ + \ + /* Const basic pointers */ \ + const char *: (1u << (ord - n - 1)), \ + const signed char *: (1u << (ord - n - 1)), \ + const unsigned char *: (1u << (ord - n - 1)), \ + const signed short int *: 0, \ + const unsigned short int *: 0, \ + const signed int *: 0, \ + const unsigned int *: 0, \ + const signed long *: 0, \ + const unsigned long *: 0, \ + const signed long long *: 0, \ + const unsigned long long *: 0, \ + const void *: 0, \ + \ + /* Systypes and pointers */ \ + default: -1) + +typedef struct { + unsigned int magic; + unsigned int size; + unsigned int nargs; + unsigned int mask; + long fmt; + long args[0]; +} flog_msg_t; + +extern int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...); +void flog_decode_msg(int fdout, const char *format, ...); +extern int flog_decode_all(int fdin, int fdout); + +#define flog_encode(fdout, fmt, ...) 
\ + flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), \ + FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) + +int flog_map_buf(int fdout); +int flog_close(int fdout); + +#endif /* __UAPI_FLOG_H__ */ diff --git a/flog/include/util.h b/flog/include/util.h new file mode 100644 index 0000000000..17a4d77997 --- /dev/null +++ b/flog/include/util.h @@ -0,0 +1,37 @@ +#ifndef __UTIL_H__ +#define __UTIL_H__ + +#include +#include + +#include "log.h" +#include "types.h" + +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + ___p; \ + }) + +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) do { if (p) free(p); } while (0) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -ENOMEM; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) + +#endif /* __UTIL_H__ */ diff --git a/flog/src/Makefile b/flog/src/Makefile new file mode 100644 index 0000000000..ee73ea7252 --- /dev/null +++ b/flog/src/Makefile @@ -0,0 +1,5 @@ +ccflags-y += -DCONFIG_X86_64 -iquote ./include $(OPTS) +ldflags-y += -r + +#obj-y += main.o +obj-y += flog.o diff --git a/flog/src/flog.c b/flog/src/flog.c new file mode 100644 index 0000000000..533625de61 --- /dev/null +++ b/flog/src/flog.c @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include + +//#include + +#include "uapi/flog.h" +#include "util.h" + +#define MAGIC 0xABCDABCD + +#define BUF_SIZE (1<<20) +static char _mbuf[BUF_SIZE]; +static char *mbuf = _mbuf; +static char *fbuf; +static uint64_t fsize; +static uint64_t mbuf_size = sizeof(_mbuf); + +/*int flog_decode_all(int fdin, int fdout) +{ + flog_msg_t *m = (void *)mbuf; + ffi_type 
*args[34] = { + [0] = &ffi_type_sint, + [1] = &ffi_type_pointer, + [2 ... 33] = &ffi_type_slong + }; + void *values[34]; + ffi_cif cif; + ffi_arg rc; + size_t i, ret; + char *fmt; + + values[0] = (void *)&fdout; + + while (1) { + ret = read(fdin, mbuf, sizeof(m)); + if (ret == 0) + break; + if (ret < 0) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + if (m->magic != MAGIC) { + fprintf(stderr, "The log file was not properly closed\n"); + break; + } + ret = m->size - sizeof(m); + if (m->size > mbuf_size) { + fprintf(stderr, "The buffer is too small"); + return -1; + } + if (read(fdin, mbuf + sizeof(m), ret) != ret) { + fprintf(stderr, "Unable to read a message: %m"); + return -1; + } + + fmt = mbuf + m->fmt; + values[1] = &fmt; + + for (i = 0; i < m->nargs; i++) { + values[i + 2] = (void *)&m->args[i]; + if (m->mask & (1u << i)) { + m->args[i] = (long)(mbuf + m->args[i]); + } + } + + if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, m->nargs + 2, + &ffi_type_sint, args) == FFI_OK) + ffi_call(&cif, FFI_FN(dprintf), &rc, values); + } + return 0; +}*/ + +static int flog_enqueue(flog_msg_t *m) +{ + if (write(1, m, m->size) != m->size) { + fprintf(stderr, "Unable to write a message\n"); + return -1; + } + return 0; +} + +/*extern char *rodata_start; +extern char *rodata_end; +*/ +/* Pre-allocate a buffer in a file and map it into memory. */ +int flog_map_buf(int fdout) +{ + uint64_t off = 0; + void *addr; + + /* + * Two buffers are mmaped into memory. A new one is mapped when a first + * one is completly filled. 
+ */ + if (fbuf && (mbuf - fbuf < BUF_SIZE)) + return 0; + + if (fbuf) { + if (munmap(fbuf, BUF_SIZE * 2)) { + fprintf(stderr, "Unable to unmap a buffer: %m"); + return -1; + } + off = mbuf - fbuf - BUF_SIZE; + fbuf = NULL; + } + + if (fsize == 0) + fsize += BUF_SIZE; + fsize += BUF_SIZE; + + if (ftruncate(fdout, fsize)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + + if (!fbuf) + addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, + MAP_FILE | MAP_SHARED, fdout, fsize - 2 * BUF_SIZE); + else + addr = mremap(fbuf + BUF_SIZE, BUF_SIZE, + BUF_SIZE * 2, MREMAP_FIXED, fbuf); + if (addr == MAP_FAILED) { + fprintf(stderr, "Unable to map a buffer: %m"); + return -1; + } + + fbuf = addr; + mbuf = fbuf + off; + mbuf_size = 2 * BUF_SIZE; + + return 0; +} + +int flog_close(int fdout) +{ + if (mbuf == _mbuf) + return 0; + + munmap(fbuf, BUF_SIZE * 2); + + if (ftruncate(fdout, fsize - 2 * BUF_SIZE + mbuf - fbuf)) { + fprintf(stderr, "Unable to truncate a file: %m"); + return -1; + } + return 0; +} + +int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...) +{ + flog_msg_t *m; + va_list argptr; + char *str_start, *p; + size_t i; + + if (mbuf != _mbuf && flog_map_buf(fdout)) + return -1; + + m = (void *) mbuf; + + m->nargs = nargs; + m->mask = mask; + + str_start = (void *)m->args + sizeof(m->args[0]) * nargs; + p = memccpy(str_start, format, 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + return -1; + } + m->fmt = str_start - mbuf; + str_start = p; + + va_start(argptr, format); + for (i = 0; i < nargs; i++) { + m->args[i] = (long)va_arg(argptr, long); + /* + * If we got a string, we should either + * reference it when in rodata, or make + * a copy (FIXME implement rodata refs). 
+ */ + if (mask & (1u << i)) { + p = memccpy(str_start, (void *)m->args[i], 0, mbuf_size - (str_start - mbuf)); + if (p == NULL) { + fprintf(stderr, "No memory for string argument\n"); + return -1; + } + m->args[i] = str_start - mbuf; + str_start = p; + } + } + va_end(argptr); + m->size = str_start - mbuf; + + /* + * A magic is required to know where we stop writing into a log file, + * if it was not properly closed. The file is mapped into memory, so a + * space in the file is allocated in advance and at the end it can have + * some unused tail. + */ + m->magic = MAGIC; + + m->size = roundup(m->size, 8); + if (mbuf == _mbuf) { + if (flog_enqueue(m)) + return -1; + } else { + mbuf += m->size; + mbuf_size -= m->size; + } + return 0; +} diff --git a/flog/src/main.c b/flog/src/main.c new file mode 100644 index 0000000000..c84e774781 --- /dev/null +++ b/flog/src/main.c @@ -0,0 +1,170 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "flog.h" + +extern char _rodata_start, _rodata_end; +char *rodata_start = &_rodata_start; +char *rodata_end = &_rodata_end; + +enum { + MODE_BINARY, + MODE_FPRINTF, + MODE_SPRINTF, + MODE_DPRINTF, +}; + +int main(int argc, char *argv[]) +{ + static const char str1[] = "String1 String1"; + static const char str2[] = "string2 string2 string2"; + int fdout = STDOUT_FILENO; + bool use_decoder = false; + int mode = MODE_BINARY; + size_t niter = 100; + int opt, idx; + size_t i; + + static const char short_opts[] = "m:o:di:h"; + static struct option long_opts[] = { + { "mode", required_argument, 0, 'm' }, + { "output", required_argument, 0, 'o' }, + { "decode", no_argument, 0, 'd' }, + { "iter", required_argument, 0, 'i' }, + { "help", no_argument, 0, 'h' }, + { }, + }; + + while (1) { + idx = -1; + opt = getopt_long(argc, argv, short_opts, long_opts, &idx); + if (opt == -1) + break; + + switch (opt) { + case 'm': + if (strcmp(optarg, "binary") == 0) { + mode = MODE_BINARY; + } else if (strcmp(optarg, 
"fprintf") == 0) { + mode = MODE_FPRINTF; + } else if (strcmp(optarg, "sprintf") == 0) { + mode = MODE_SPRINTF; + } else if (strcmp(optarg, "dprintf") == 0) { + mode = MODE_DPRINTF; + } else + goto usage; + break; + case 'o': + if (strcmp(optarg, "stdout") == 0) { + fdout = fileno(stdout); + } else if (strcmp(optarg, "stderr") == 0) { + fdout = fileno(stderr); + } else { + fdout = open(optarg, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fdout < 0) { + fprintf(stderr, "Can't open %s: %s\n", + optarg, strerror(errno)); + exit(1); + } + } + break; + case 'i': + niter = atoi(optarg); + break; + case 'd': + use_decoder = true; + break; + case 'h': + default: + goto usage; + } + } + + switch (mode) { + case MODE_BINARY: + if (use_decoder) + return flog_decode_all(STDIN_FILENO, fdout); + + if (fdout != STDOUT_FILENO && flog_map_buf(fdout)) + return 1; + for (i = 0; i < niter; i++) + if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", + str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2)) + return 1; + if (flog_close(fdout)) + return 1; + break; + case MODE_DPRINTF: + { + for (i = 0; i < niter; i++) { + dprintf(fdout, "Some message %s %s %c %li %d %lu\n", + str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + case MODE_FPRINTF: + { + FILE *f = fdopen(fdout, "w"); + + for (i = 0; i < niter; i++) { + fprintf(f, "Some message %s %s %c %li %d %lu\n", + str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + fflush(f); + } + fclose(f); + break; + } + case MODE_SPRINTF: + { + static char buf[4096]; + + for (i = 0; i < niter; i++) { + sprintf(buf, "Some message %s %s %c %li %d %lu\n", + str1, str2, 'c', (long)-4, (short)2, + (unsigned long)2); + } + break; + } + default: + return 1; + } + + return 0; +usage: + fprintf(stderr, + "flog [--mode binary|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" + "\n" + + "Examples:\n" + "\n" + + " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" + 
"\n" + " flog -m dprintf -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode without processing (queue messages only)\n" + "\n" + " flog -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after\n" + "\n" + " flog -i 100000 -d\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after, writting results into 'out' file\n" + "\n" + " flog -i 100000 -d -o out\n" + "\n"); + return 1; +} diff --git a/flog/tests/test00 b/flog/tests/test00 new file mode 100755 index 0000000000..a7937e4a18 --- /dev/null +++ b/flog/tests/test00 @@ -0,0 +1,22 @@ +#!/bin/sh + +set -e -x + +echo Map a log file into memory +time ./flog run -i 1000000 -o /tmp/flog.raw.map +echo Write into a log file +time ./flog run -i 1000000 > /tmp/flog.raw +echo Use fprintf +time ./flog run -m fprintf -i 1000000 -o /tmp/flog.fprintf.txt +echo Use dprintf +time ./flog run -m dprintf -i 1000000 -o /tmp/flog.dprintf.txt +echo Use sprintf +time ./flog run -m sprintf -i 1000000 + +time ./flog run -d < /tmp/flog.raw > /tmp/flog.raw.txt +cmp /tmp/flog.raw.txt /tmp/flog.fprintf.txt + +time ./flog run -d < /tmp/flog.raw.map > /tmp/flog.raw.map.txt +cmp /tmp/flog.raw.map.txt /tmp/flog.fprintf.txt + +cmp /tmp/flog.dprintf.txt /tmp/flog.fprintf.txt From 1444b72d25c754fa9508525826f26b7a8d3662cd Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 28 Sep 2020 07:04:00 +0000 Subject: [PATCH 002/321] flog: Missing varargs init or cleanup (VARARGS) CID 302713 (#1 of 1): Missing varargs init or cleanup (VARARGS) va_end was not called for argptr. 
Signed-off-by: Adrian Reber --- flog/src/flog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/flog/src/flog.c b/flog/src/flog.c index 533625de61..40cce3fedc 100644 --- a/flog/src/flog.c +++ b/flog/src/flog.c @@ -186,6 +186,7 @@ int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char p = memccpy(str_start, (void *)m->args[i], 0, mbuf_size - (str_start - mbuf)); if (p == NULL) { fprintf(stderr, "No memory for string argument\n"); + va_end(argptr); return -1; } m->args[i] = str_start - mbuf; From cc193dc669088b2a561b232ff98ea17ed21b80ff Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 4 Aug 2021 07:27:07 +0000 Subject: [PATCH 003/321] Run 'make indent' on 'flog/' Separate commit for easier criu-dev <-> master transfer. Acked-by: Mike Rapoport Signed-off-by: Adrian Reber --- flog/include/compiler.h | 88 +++++++++++++++------------- flog/include/log.h | 10 ++-- flog/include/types.h | 16 +++--- flog/include/uapi/flog.h | 120 ++++++++++++++++++--------------------- flog/include/util.h | 52 +++++++++-------- flog/src/flog.c | 11 ++-- flog/src/main.c | 37 +++++------- 7 files changed, 161 insertions(+), 173 deletions(-) diff --git a/flog/include/compiler.h b/flog/include/compiler.h index 3e56eb0e64..80264ec631 100644 --- a/flog/include/compiler.h +++ b/flog/include/compiler.h @@ -8,64 +8,70 @@ */ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)])) -#define __stringify_1(x...) #x -#define __stringify(x...) __stringify_1(x) +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) -#define NORETURN __attribute__((__noreturn__)) -#define __packed __attribute__((__packed__)) -#define __used __attribute__((__used__)) -#define __maybe_unused __attribute__((unused)) -#define __always_unused __attribute__((unused)) +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) -#define __section(S) __attribute__ ((__section__(#S))) +#define __section(S) __attribute__((__section__(#S))) #ifndef __always_inline -# define __always_inline inline __attribute__((always_inline)) +#define __always_inline inline __attribute__((always_inline)) #endif -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) #ifndef always_inline -# define always_inline __always_inline +#define always_inline __always_inline #endif #ifndef noinline -# define noinline __attribute__((noinline)) +#define noinline __attribute__((noinline)) #endif -#define __aligned(x) __attribute__((aligned(x))) +#define __aligned(x) __attribute__((aligned(x))) #ifndef offsetof -# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#define offsetof(TYPE, MEMBER) ((size_t) & ((TYPE *)0)->MEMBER) #endif -#define barrier() asm volatile("" ::: "memory") - -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) - -#define __round_mask(x, y) ((__typeof__(x))((y) - 1)) -#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) -#define round_down(x, y) ((x) & ~__round_mask(x, y)) -#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) -#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) - -#define min(x, y) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - (void) (&_min1 
== &_min2); \ - _min1 < _min2 ? _min1 : _min2; }) - -#define max(x, y) ({ \ - typeof(x) _max1 = (x); \ - typeof(y) _max2 = (y); \ - (void) (&_max1 == &_max2); \ - _max1 > _max2 ? _max1 : _max2; }) - -#define is_log2(v) (((v) & ((v) - 1)) == 0) +#define barrier() asm volatile("" ::: "memory") + +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) + +#define min(x, y) \ + ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void)(&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; \ + }) + +#define max(x, y) \ + ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void)(&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; \ + }) + +#define is_log2(v) (((v) & ((v)-1)) == 0) #endif /* __COMPILER_H__ */ diff --git a/flog/include/log.h b/flog/include/log.h index 1a165ea9fb..8aafe44b75 100644 --- a/flog/include/log.h +++ b/flog/include/log.h @@ -3,15 +3,15 @@ #include -#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) +#define pr_out(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__) #if 1 -# define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) #else -# define pr_debug(fmt, ...) +#define pr_debug(fmt, ...) #endif -#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): "fmt, __FILE__, __LINE__, ##__VA_ARGS__) -#define pr_perror(fmt, ...) fprintf(stderr, "Error (%s:%d): "fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_err(fmt, ...) fprintf(stderr, "Error (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_perror(fmt, ...) 
fprintf(stderr, "Error (%s:%d): " fmt "%m\n", __FILE__, __LINE__, ##__VA_ARGS__) #endif /* __LOG_H__ */ diff --git a/flog/include/types.h b/flog/include/types.h index 0e15bfbff5..07c992968b 100644 --- a/flog/include/types.h +++ b/flog/include/types.h @@ -4,13 +4,13 @@ #include #include -typedef uint64_t u64; -typedef int64_t s64; -typedef uint32_t u32; -typedef int32_t s32; -typedef uint16_t u16; -typedef int16_t s16; -typedef uint8_t u8; -typedef int8_t s8; +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; #endif /* __FLOG_TYPES_H__ */ diff --git a/flog/include/uapi/flog.h b/flog/include/uapi/flog.h index 2d879110fc..6061f4556a 100644 --- a/flog/include/uapi/flog.h +++ b/flog/include/uapi/flog.h @@ -13,68 +13,59 @@ /* * By Laurent Deniau at https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s */ -#define FLOG_PP_NARG_(...) FLOG_PP_ARG_N(__VA_ARGS__) -#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) +#define FLOG_PP_NARG_(...) FLOG_PP_ARG_N(__VA_ARGS__) +#define FLOG_PP_NARG(...) FLOG_PP_NARG_(1, ##__VA_ARGS__, FLOG_PP_RSEQ_N()) -#define FLOG_PP_ARG_N( _0, _1, _2, _3, _4, \ - _5, _6, _7, _8, _9, \ - _10,_11,_12,_13,_14, \ - _15,_16,_17,_18,_19, \ - _20,_21,_22,_23,_24, \ - _25,_26,_27,_28,_29, \ - _30,_31, N, ...) N +#define FLOG_PP_ARG_N(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, \ + _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, N, ...) \ + N -#define FLOG_PP_RSEQ_N() \ - 31, 30, 29, 28, 27, \ - 26, 25, 24, 23, 22, \ - 21, 20, 19, 18, 17, \ - 16, 15, 14, 13, 12, \ - 11, 10, 9, 8, 7, \ - 6, 5, 4, 3, 2, \ - 1, 0 +#define FLOG_PP_RSEQ_N() \ + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, \ + 2, 1, 0 -#define FLOG_GENMASK_0(N, x) 0 -#define FLOG_GENMASK_1(N, op, x, ...) 
(op(N, 0, x)) -#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_11(N, op, x, ...) ((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_15(N, op, x, ...) ((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_17(N, op, x, ...) ((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_22(N, op, x, ...) 
((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) -#define FLOG_GENMASK_32(N, op, x, ...) ((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_0(N, x) 0 +#define FLOG_GENMASK_1(N, op, x, ...) (op(N, 0, x)) +#define FLOG_GENMASK_2(N, op, x, ...) ((op(N, 1, x)) | FLOG_GENMASK_1(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_3(N, op, x, ...) ((op(N, 2, x)) | FLOG_GENMASK_2(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_4(N, op, x, ...) ((op(N, 3, x)) | FLOG_GENMASK_3(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_5(N, op, x, ...) ((op(N, 4, x)) | FLOG_GENMASK_4(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_6(N, op, x, ...) ((op(N, 5, x)) | FLOG_GENMASK_5(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_7(N, op, x, ...) ((op(N, 6, x)) | FLOG_GENMASK_6(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_8(N, op, x, ...) ((op(N, 7, x)) | FLOG_GENMASK_7(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_9(N, op, x, ...) ((op(N, 8, x)) | FLOG_GENMASK_8(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_10(N, op, x, ...) ((op(N, 9, x)) | FLOG_GENMASK_9(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_11(N, op, x, ...) 
((op(N, 10, x)) | FLOG_GENMASK_10(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_12(N, op, x, ...) ((op(N, 11, x)) | FLOG_GENMASK_11(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_13(N, op, x, ...) ((op(N, 12, x)) | FLOG_GENMASK_12(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_14(N, op, x, ...) ((op(N, 13, x)) | FLOG_GENMASK_13(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_15(N, op, x, ...) ((op(N, 14, x)) | FLOG_GENMASK_14(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_16(N, op, x, ...) ((op(N, 15, x)) | FLOG_GENMASK_15(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_17(N, op, x, ...) ((op(N, 16, x)) | FLOG_GENMASK_16(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_18(N, op, x, ...) ((op(N, 17, x)) | FLOG_GENMASK_17(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_19(N, op, x, ...) ((op(N, 18, x)) | FLOG_GENMASK_18(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_20(N, op, x, ...) ((op(N, 19, x)) | FLOG_GENMASK_19(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_21(N, op, x, ...) ((op(N, 20, x)) | FLOG_GENMASK_20(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_22(N, op, x, ...) ((op(N, 21, x)) | FLOG_GENMASK_21(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_23(N, op, x, ...) ((op(N, 22, x)) | FLOG_GENMASK_22(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_24(N, op, x, ...) ((op(N, 23, x)) | FLOG_GENMASK_23(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_25(N, op, x, ...) ((op(N, 24, x)) | FLOG_GENMASK_24(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_26(N, op, x, ...) ((op(N, 25, x)) | FLOG_GENMASK_25(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_27(N, op, x, ...) ((op(N, 26, x)) | FLOG_GENMASK_26(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_28(N, op, x, ...) ((op(N, 27, x)) | FLOG_GENMASK_27(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_29(N, op, x, ...) ((op(N, 28, x)) | FLOG_GENMASK_28(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_30(N, op, x, ...) ((op(N, 29, x)) | FLOG_GENMASK_29(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_31(N, op, x, ...) ((op(N, 30, x)) | FLOG_GENMASK_30(N, op, __VA_ARGS__)) +#define FLOG_GENMASK_32(N, op, x, ...) 
((op(N, 31, x)) | FLOG_GENMASK_31(N, op, __VA_ARGS__)) -#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) -#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) -#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 +#define FLOG_CONCAT(arg1, arg2) FLOG_CONCAT1(arg1, arg2) +#define FLOG_CONCAT1(arg1, arg2) FLOG_CONCAT2(arg1, arg2) +#define FLOG_CONCAT2(arg1, arg2) arg1##arg2 -#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) -#define FLOG_GENMASK(op, ...) FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) +#define FLOG_GENMASK_(N, op, ...) FLOG_CONCAT(FLOG_GENMASK_, N)(N, op, ##__VA_ARGS__) +#define FLOG_GENMASK(op, ...) FLOG_GENMASK_(FLOG_PP_NARG(__VA_ARGS__), op, ##__VA_ARGS__) -#define flog_genbit(ord, n, v, ...) \ +#define flog_genbit(ord, n, v, ...) \ _Generic((v), \ \ /* Basic types */ \ @@ -127,21 +118,20 @@ default: -1) typedef struct { - unsigned int magic; - unsigned int size; - unsigned int nargs; - unsigned int mask; - long fmt; - long args[0]; + unsigned int magic; + unsigned int size; + unsigned int nargs; + unsigned int mask; + long fmt; + long args[0]; } flog_msg_t; extern int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char *format, ...); void flog_decode_msg(int fdout, const char *format, ...); extern int flog_decode_all(int fdin, int fdout); -#define flog_encode(fdout, fmt, ...) \ - flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), \ - FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) +#define flog_encode(fdout, fmt, ...) \ + flog_encode_msg(fdout, FLOG_PP_NARG(__VA_ARGS__), FLOG_GENMASK(flog_genbit, ##__VA_ARGS__), fmt, ##__VA_ARGS__) int flog_map_buf(int fdout); int flog_close(int fdout); diff --git a/flog/include/util.h b/flog/include/util.h index 17a4d77997..7b1edb6885 100644 --- a/flog/include/util.h +++ b/flog/include/util.h @@ -7,31 +7,35 @@ #include "log.h" #include "types.h" -#define __xalloc(op, size, ...) 
\ - ({ \ - void *___p = op(__VA_ARGS__); \ - ___p; \ +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + ___p; \ }) -#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) -#define xmalloc(size) __xalloc(malloc, size, size) -#define xzalloc(size) __xalloc(calloc, size, 1, size) -#define xrealloc(p, size) __xalloc(realloc, size, p, size) - -#define xfree(p) do { if (p) free(p); } while (0) - -#define xrealloc_safe(pptr, size) \ - ({ \ - int __ret = -ENOMEM; \ - void *new = xrealloc(*pptr, size); \ - if (new) { \ - *pptr = new; \ - __ret = 0; \ - } \ - __ret; \ - }) - -#define memzero_p(p) memset(p, 0, sizeof(*p)) -#define memzero(p, size) memset(p, 0, size) +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) \ + do { \ + if (p) \ + free(p); \ + } while (0) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -ENOMEM; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) #endif /* __UTIL_H__ */ diff --git a/flog/src/flog.c b/flog/src/flog.c index 40cce3fedc..8f11a36cbf 100644 --- a/flog/src/flog.c +++ b/flog/src/flog.c @@ -13,7 +13,7 @@ #define MAGIC 0xABCDABCD -#define BUF_SIZE (1<<20) +#define BUF_SIZE (1 << 20) static char _mbuf[BUF_SIZE]; static char *mbuf = _mbuf; static char *fbuf; @@ -119,11 +119,10 @@ int flog_map_buf(int fdout) } if (!fbuf) - addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, - MAP_FILE | MAP_SHARED, fdout, fsize - 2 * BUF_SIZE); + addr = mmap(NULL, BUF_SIZE * 2, PROT_WRITE | PROT_READ, MAP_FILE | MAP_SHARED, fdout, + fsize - 2 * BUF_SIZE); else - addr = mremap(fbuf + BUF_SIZE, BUF_SIZE, - BUF_SIZE * 2, MREMAP_FIXED, fbuf); + addr = mremap(fbuf + BUF_SIZE, 
BUF_SIZE, BUF_SIZE * 2, MREMAP_FIXED, fbuf); if (addr == MAP_FAILED) { fprintf(stderr, "Unable to map a buffer: %m"); return -1; @@ -160,7 +159,7 @@ int flog_encode_msg(int fdout, unsigned int nargs, unsigned int mask, const char if (mbuf != _mbuf && flog_map_buf(fdout)) return -1; - m = (void *) mbuf; + m = (void *)mbuf; m->nargs = nargs; m->mask = mask; diff --git a/flog/src/main.c b/flog/src/main.c index c84e774781..fc5d64ebd2 100644 --- a/flog/src/main.c +++ b/flog/src/main.c @@ -33,12 +33,9 @@ int main(int argc, char *argv[]) static const char short_opts[] = "m:o:di:h"; static struct option long_opts[] = { - { "mode", required_argument, 0, 'm' }, - { "output", required_argument, 0, 'o' }, - { "decode", no_argument, 0, 'd' }, - { "iter", required_argument, 0, 'i' }, - { "help", no_argument, 0, 'h' }, - { }, + { "mode", required_argument, 0, 'm' }, { "output", required_argument, 0, 'o' }, + { "decode", no_argument, 0, 'd' }, { "iter", required_argument, 0, 'i' }, + { "help", no_argument, 0, 'h' }, {}, }; while (1) { @@ -68,8 +65,7 @@ int main(int argc, char *argv[]) } else { fdout = open(optarg, O_RDWR | O_CREAT | O_TRUNC, 0644); if (fdout < 0) { - fprintf(stderr, "Can't open %s: %s\n", - optarg, strerror(errno)); + fprintf(stderr, "Can't open %s: %s\n", optarg, strerror(errno)); exit(1); } } @@ -94,42 +90,35 @@ int main(int argc, char *argv[]) if (fdout != STDOUT_FILENO && flog_map_buf(fdout)) return 1; for (i = 0; i < niter; i++) - if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", - str1, str2, 'c', (long)-4, (short)2, - (unsigned long)2)) + if (flog_encode(fdout, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, + (short)2, (unsigned long)2)) return 1; if (flog_close(fdout)) return 1; - break; - case MODE_DPRINTF: - { + break; + case MODE_DPRINTF: { for (i = 0; i < niter; i++) { - dprintf(fdout, "Some message %s %s %c %li %d %lu\n", - str1, str2, 'c', (long)-4, (short)2, + dprintf(fdout, "Some message %s %s %c %li %d %lu\n", str1, 
str2, 'c', (long)-4, (short)2, (unsigned long)2); } break; } - case MODE_FPRINTF: - { + case MODE_FPRINTF: { FILE *f = fdopen(fdout, "w"); for (i = 0; i < niter; i++) { - fprintf(f, "Some message %s %s %c %li %d %lu\n", - str1, str2, 'c', (long)-4, (short)2, + fprintf(f, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, (unsigned long)2); fflush(f); } fclose(f); break; } - case MODE_SPRINTF: - { + case MODE_SPRINTF: { static char buf[4096]; for (i = 0; i < niter; i++) { - sprintf(buf, "Some message %s %s %c %li %d %lu\n", - str1, str2, 'c', (long)-4, (short)2, + sprintf(buf, "Some message %s %s %c %li %d %lu\n", str1, str2, 'c', (long)-4, (short)2, (unsigned long)2); } break; From 1fa21a0fafc5cc1a42476d4b666400a933d0b6cc Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 6 Apr 2022 17:45:57 -0700 Subject: [PATCH 004/321] flog: typo: mmaped -> mmapped It is mapped, not maped. Same applies for mmap I guess. Found by codespell, except it wants to change it to mapped, which will make it less specific. Signed-off-by: Kir Kolyshkin --- flog/src/flog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flog/src/flog.c b/flog/src/flog.c index 8f11a36cbf..f48b7f127e 100644 --- a/flog/src/flog.c +++ b/flog/src/flog.c @@ -94,7 +94,7 @@ int flog_map_buf(int fdout) void *addr; /* - * Two buffers are mmaped into memory. A new one is mapped when a first + * Two buffers are mmapped into memory. A new one is mapped when a first * one is completly filled. */ if (fbuf && (mbuf - fbuf < BUF_SIZE)) From 7eaf7a38f478e2f222a7267b544eeeb927ecc220 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 30 Mar 2022 18:45:16 -0700 Subject: [PATCH 005/321] flog: fix some codespell warnings Brought to you by codespell -w (using codespell v2.1.0). 
[v2: use "make indent" on the result] Signed-off-by: Kir Kolyshkin --- flog/src/flog.c | 2 +- flog/src/main.c | 51 ++++++++++++++++++++++++------------------------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/flog/src/flog.c b/flog/src/flog.c index f48b7f127e..d7660f18d8 100644 --- a/flog/src/flog.c +++ b/flog/src/flog.c @@ -95,7 +95,7 @@ int flog_map_buf(int fdout) /* * Two buffers are mmapped into memory. A new one is mapped when a first - * one is completly filled. + * one is completely filled. */ if (fbuf && (mbuf - fbuf < BUF_SIZE)) return 0; diff --git a/flog/src/main.c b/flog/src/main.c index fc5d64ebd2..e027917c68 100644 --- a/flog/src/main.c +++ b/flog/src/main.c @@ -129,31 +129,30 @@ int main(int argc, char *argv[]) return 0; usage: - fprintf(stderr, - "flog [--mode binary|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" - "\n" - - "Examples:\n" - "\n" - - " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" - "\n" - " flog -m dprintf -i 100000\n" - "\n" - - " - run 100000 iterations in binary mode without processing (queue messages only)\n" - "\n" - " flog -i 100000\n" - "\n" - - " - run 100000 iterations in binary mode with decoding after\n" - "\n" - " flog -i 100000 -d\n" - "\n" - - " - run 100000 iterations in binary mode with decoding after, writting results into 'out' file\n" - "\n" - " flog -i 100000 -d -o out\n" - "\n"); + fprintf(stderr, "flog [--mode binary|dprintf] [--output stdout|stderr|filename] [--decode] [--iter number]\n" + "\n" + + "Examples:\n" + "\n" + + " - run 100000 iterations of instant message processing (immediate dprintf calls)\n" + "\n" + " flog -m dprintf -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode without processing (queue messages only)\n" + "\n" + " flog -i 100000\n" + "\n" + + " - run 100000 iterations in binary mode with decoding after\n" + "\n" + " flog -i 100000 -d\n" + "\n" + + " - run 100000 iterations in binary mode with 
decoding after, writing results into 'out' file\n" + "\n" + " flog -i 100000 -d -o out\n" + "\n"); return 1; } From aed3f34c092e3cb2214514d198d5c0762b0c2b7a Mon Sep 17 00:00:00 2001 From: Pengda Yang Date: Wed, 15 Mar 2023 16:58:31 +0800 Subject: [PATCH 006/321] limit the field width of 'scanf' Fixes: #2121 Signed-off-by: Pengda Yang --- criu/proc_parse.c | 6 +++--- test/zdtm/lib/fs.c | 2 +- test/zdtm/static/apparmor.c | 2 +- test/zdtm/static/apparmor_stacking.c | 2 +- test/zdtm/static/cgroup01.c | 2 +- test/zdtm/static/cgroup02.c | 2 +- test/zdtm/static/change_mnt_context.c | 2 +- test/zdtm/static/file_locks01.c | 2 +- test/zdtm/static/file_locks02.c | 2 +- test/zdtm/static/file_locks03.c | 2 +- test/zdtm/static/file_locks04.c | 2 +- test/zdtm/static/netns-dev.c | 2 +- test/zdtm/static/ofd_file_locks.c | 2 +- 13 files changed, 15 insertions(+), 15 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 5e96b5c963..61c1eee240 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1407,7 +1407,7 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) goto err; new->mountpoint[0] = '.'; - ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, + ret = sscanf(str, "%i %i %u:%u %ms %4094s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; @@ -2208,10 +2208,10 @@ static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { - num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { - num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = 
sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index bf8cd9cd31..efcc7a1d08 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -54,7 +54,7 @@ mnt_info_t *get_cwd_mnt_info(void) while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); - ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); + ret = sscanf(str, "%i %i %u:%u %4095s %4095s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index 713ffaa469..dc16368217 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -59,7 +59,7 @@ int checkprofile(void) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/apparmor_stacking.c b/test/zdtm/static/apparmor_stacking.c index 76de8b8b49..0bc36048cf 100644 --- a/test/zdtm/static/apparmor_stacking.c +++ b/test/zdtm/static/apparmor_stacking.c @@ -56,7 +56,7 @@ static int checkprofile(pid_t pid, char *expected) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/cgroup01.c b/test/zdtm/static/cgroup01.c index bc8515264d..7bfb677623 100644 --- a/test/zdtm/static/cgroup01.c +++ b/test/zdtm/static/cgroup01.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) if (!s) continue; - sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(paux, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { diff --git a/test/zdtm/static/cgroup02.c 
b/test/zdtm/static/cgroup02.c index 6229a8a089..8a925c0a43 100644 --- a/test/zdtm/static/cgroup02.c +++ b/test/zdtm/static/cgroup02.c @@ -75,7 +75,7 @@ bool test_exists(char *mountinfo_line, char *path) char aux[1024], paux[1024]; struct stat st; - sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); ssprintf(paux, "%s/%s", aux, path); diff --git a/test/zdtm/static/change_mnt_context.c b/test/zdtm/static/change_mnt_context.c index 6d436014b3..8787ae5cf9 100644 --- a/test/zdtm/static/change_mnt_context.c +++ b/test/zdtm/static/change_mnt_context.c @@ -46,7 +46,7 @@ int main(int argc, char **argv) if (!pos) continue; - result = sscanf(pos, " - %*s %*s %s", opts); + result = sscanf(pos, " - %*s %*s %1023s", opts); if (result != 1) { fail("Not able to sscanf line from mountinfo"); goto out; diff --git a/test/zdtm/static/file_locks01.c b/test/zdtm/static/file_locks01.c index beea171f5d..bfdca51d93 100644 --- a/test/zdtm/static/file_locks01.c +++ b/test/zdtm/static/file_locks01.c @@ -107,7 +107,7 @@ static int check_file_lock(int fd, char *expected_type, char *expected_option, u memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no); if (num < 7) { pr_err("Invalid lock info\n"); diff --git a/test/zdtm/static/file_locks02.c b/test/zdtm/static/file_locks02.c index d2049ebaa2..ae4827de97 100644 --- a/test/zdtm/static/file_locks02.c +++ b/test/zdtm/static/file_locks02.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, 
&fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks03.c b/test/zdtm/static/file_locks03.c index 35ef41a21b..228e668925 100644 --- a/test/zdtm/static/file_locks03.c +++ b/test/zdtm/static/file_locks03.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks04.c b/test/zdtm/static/file_locks04.c index 11d224fa70..7e0d2654e1 100644 --- a/test/zdtm/static/file_locks04.c +++ b/test/zdtm/static/file_locks04.c @@ -34,7 +34,7 @@ static int check_file_locks(pid_t child_pid, int fd, int child_fd) continue; test_msg("c: %s", buf); - num = sscanf(buf, "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index 1e6ee1dea5..f268f2fece 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -414,7 +414,7 @@ static int check_stable_secret(struct test_conf *tc) return -1; } - ret = fscanf(fp, "%s", val); + ret = fscanf(fp, "%200s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); diff --git a/test/zdtm/static/ofd_file_locks.c b/test/zdtm/static/ofd_file_locks.c index 68b6f22f52..a68fa38eeb 100644 --- a/test/zdtm/static/ofd_file_locks.c +++ b/test/zdtm/static/ofd_file_locks.c @@ -16,7 +16,7 @@ static int parse_ofd_lock(char *buf, struct flock *lck) if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't 
lock, skip record */ - num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, &start, fl_end); + num = sscanf(buf, "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); From d6860d06122855623038563092cbbb3081c763b0 Mon Sep 17 00:00:00 2001 From: Suraj Shirvankar Date: Wed, 12 Apr 2023 13:38:06 +0000 Subject: [PATCH 007/321] sk-inet: Add IP TOS socket option The TOS(type of service) field in the ip header allows you specify the priority of the socket data. Signed-off-by: Suraj Shirvankar --- criu/sk-inet.c | 4 ++++ images/sk-inet.proto | 1 + 2 files changed, 5 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 4bd5abff17..24e92a8521 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -416,9 +416,11 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io } else { ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); ret |= dump_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + ret |= dump_opt(sk, SOL_IP, IP_TOS, &ioe->tos); } ioe->has_freebind = ioe->freebind; ioe->has_pktinfo = !!ioe->pktinfo; + ioe->has_tos = !!ioe->tos; return ret; } @@ -813,6 +815,8 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); if (ioe->has_pktinfo) ret |= restore_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + if (ioe->has_tos) + ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); } if (ioe->raw) diff --git a/images/sk-inet.proto b/images/sk-inet.proto index ee1f0ae410..666326fa40 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -19,6 +19,7 @@ message ip_opts_entry { optional ip_opts_raw_entry raw = 4; optional bool pktinfo = 5; + optional uint32 tos = 6; } message inet_sk_entry { From 0dd4668aaf28206fb08ed98c0711a7bee7a2ce3d Mon Sep 17 00:00:00 2001 From: Suraj Shirvankar Date: Thu, 13 Apr 2023 22:27:11 +0200 Subject: [PATCH 
008/321] zdtm: Add tests for ip tos restore Signed-off-by: Suraj Shirvankar --- test/zdtm/static/sock_ip_opts00.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test/zdtm/static/sock_ip_opts00.c b/test/zdtm/static/sock_ip_opts00.c index 08970c0daf..d890410d89 100644 --- a/test/zdtm/static/sock_ip_opts00.c +++ b/test/zdtm/static/sock_ip_opts00.c @@ -3,6 +3,7 @@ #include #include +#include #include #include "zdtmtst.h" @@ -19,11 +20,13 @@ const char *test_author = "Pavel Tikhomirov "; struct sk_opt { int level; int opt; + int val; }; struct sk_opt sk_opts_v4[] = { - { SOL_IP, IP_FREEBIND }, - { SOL_IP, IP_PKTINFO }, + { SOL_IP, IP_FREEBIND, IP_OPT_VAL }, + { SOL_IP, IP_PKTINFO, IP_OPT_VAL }, + { SOL_IP, IP_TOS, IPTOS_TOS(IPTOS_THROUGHPUT) }, }; #ifndef IPV6_FREEBIND @@ -31,8 +34,8 @@ struct sk_opt sk_opts_v4[] = { #endif struct sk_opt sk_opts_v6[] = { - { SOL_IPV6, IPV6_FREEBIND }, - { SOL_IPV6, IPV6_RECVPKTINFO }, + { SOL_IPV6, IPV6_FREEBIND, IP_OPT_VAL }, + { SOL_IPV6, IPV6_RECVPKTINFO, IP_OPT_VAL }, }; struct sk_conf { @@ -71,7 +74,7 @@ int main(int argc, char **argv) n_opts = sk_confs[i].domain == AF_INET ? 
ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); for (j = 0; j < n_opts; j++) { - val = IP_OPT_VAL; + val = opts[j].val; if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); goto close; @@ -93,7 +96,7 @@ int main(int argc, char **argv) goto close; } - if (val != IP_OPT_VAL) { + if (val != opts[j].val) { fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); goto close; From 13eb87606e47b5d5a6813fe42b632b7861f7343c Mon Sep 17 00:00:00 2001 From: hdzhoujie Date: Tue, 18 Apr 2023 21:03:53 +0800 Subject: [PATCH 009/321] dump: increase fcntl call failure judgment The pipe_size type is unsigned int, when the fcntl call fails and returns -1, it will cause a negative rollover problem. Signed-off-by: zhoujie --- criu/page-pipe.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/criu/page-pipe.c b/criu/page-pipe.c index 54dc3ccc41..aab6742be7 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -99,6 +99,7 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl { struct page_pipe_buf *prev = pp_prev_ppb(pp, ppb_flags); struct page_pipe_buf *ppb; + int ppb_size = 0; ppb = xmalloc(sizeof(*ppb)); if (!ppb) @@ -120,7 +121,13 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl cnt_add(CNT_PAGE_PIPES, 1); ppb->pipe_off = 0; - ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE; + ppb_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0); + if (ppb_size < 0) { + xfree(ppb); + pr_perror("Can't get pipe size"); + return NULL; + } + ppb->pipe_size = ppb_size / PAGE_SIZE; pp->nr_pipes++; } From 970c4abe7b6df0ef691e476e3964480dee5508fe Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 24 Apr 2023 09:28:19 +0200 Subject: [PATCH 010/321] compel: support XSAVE on newer Intel CPUs Newer Intel CPUs (Sapphire Rapids)
have a much larger xsave area than before. Looking at older CPUs I see 2440 bytes. # cpuid -1 -l 0xd -s 0 ... bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) On newer CPUs (Sapphire Rapids) it grows to 11008 bytes. # cpuid -1 -l 0xd -s 0 ... bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) This increases the xsave area from one page to four pages. Without this patch the fpu03 test fails, with this patch it works again. Signed-off-by: Adrian Reber --- .../arch/x86/src/lib/include/uapi/asm/fpu.h | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index bd3b0cbd5c..8c83dd9ae4 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -21,7 +21,28 @@ #define XSTATE_YMM 0x4 #define FXSAVE_SIZE 512 -#define XSAVE_SIZE 4096 +/* + * This used to be 4096 (one page). There is a comment below concerning + * this size: + * "One page should be enough for the whole xsave state ;-)" + * Which is kind of funny as it is no longer enough ;-) + * + * Older CPUs: + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) + * + * Newer CPUs (Sapphire Rapids): + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) + * + * So one page is no longer enough... But: + * + * Four pages should be enough for the whole xsave state ;-) + */ + +#define XSAVE_SIZE 4*4096 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE @@ -235,8 +256,11 @@ struct pkru_state { * * * One page should be enough for the whole xsave state ;-) + * + * Of course it was not ;-) Now using four pages...
+ * */ -#define EXTENDED_STATE_AREA_SIZE (4096 - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) +#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) /* * cpu requires it to be 64 byte aligned From 1d4c5ed74de3180eaa1b9527e99cbe6dbb9dcbc5 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 24 Apr 2023 13:53:41 +0000 Subject: [PATCH 011/321] ci: fix new codespell errors Signed-off-by: Adrian Reber --- .codespellrc | 2 +- compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h | 2 +- compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h | 2 +- criu/include/image.h | 2 +- criu/mem.c | 2 +- criu/mount.c | 2 +- criu/namespaces.c | 2 +- criu/net.c | 2 +- criu/pie/restorer.c | 2 +- flog/include/uapi/flog.h | 2 +- include/common/scm.h | 2 +- plugins/amdgpu/README.md | 2 +- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- test/exhaustive/unix.py | 2 +- test/others/app-emu/java/HelloWorld/run.sh | 2 +- test/others/app-emu/make/run.sh | 2 +- test/zdtm/static/child_opened_proc.c | 2 +- test/zdtm/static/maps00.c | 2 +- test/zdtm/static/mntns_root_bind.c | 2 +- test/zdtm/static/stopped.c | 2 +- test/zdtm/transition/ipc.c | 4 ++-- test/zdtm/transition/lazy-thp.c | 2 +- 22 files changed, 23 insertions(+), 23 deletions(-) diff --git a/.codespellrc b/.codespellrc index 765dacfabb..dd31dd851c 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] skip = ./.git,./test/pki -ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng +ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index f8ec55d6c0..9152024fd8 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -18,7 +18,7 @@ struct aux_context { struct _aarch64_ctx end; }; -// XXX: the 
idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index 8cc94ba740..0c4ccb6486 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -14,7 +14,7 @@ */ #include -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include diff --git a/criu/include/image.h b/criu/include/image.h index 5cb01bde20..9a275565f9 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -41,7 +41,7 @@ * implementation and it is specific to every kernel version, * its contents should not be dumped ever * - vdso,vvar - * the vDSO area, it might reqire additional memory + * the vDSO area, it might require additional memory * contents modification especially when tasks are * migrating between different kernel versions * - heap diff --git a/criu/mem.c b/criu/mem.c index ab86a1f6d7..9bf7cae971 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -161,7 +161,7 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr) * put the memory into the page-pipe's pipe. * * "Holes" in page-pipe are regions, that should be dumped, but - * the memory contents is present in the pagent image set. + * the memory contents is present in the parent image set. 
*/ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, diff --git a/criu/mount.c b/criu/mount.c index db9db63b27..c26aaa58da 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -2823,7 +2823,7 @@ static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { - struct mount_info *mi; /* child is remaped into the root yards */ + struct mount_info *mi; /* child is remapped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; diff --git a/criu/namespaces.c b/criu/namespaces.c index b1b5303fa5..b7c0ab4008 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -1454,7 +1454,7 @@ int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) * each other easily. Stream socket require manual * messages boundaries. * - * b) Make callers note the damon death by seeing the + * b) Make callers note the daemon death by seeing the * disconnected socket. In case of dgram socket * callers would just get stuck in receiving the * response. diff --git a/criu/net.c b/criu/net.c index 2793b18e66..84250598c0 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3433,7 +3433,7 @@ struct ns_id *net_get_root_ns(void) /* * socket_diag doesn't report unbound and unconnected sockets, - * so we have to get their network namesapces explicitly + * so we have to get their network namespaces explicitly */ struct ns_id *get_socket_ns(int lfd) { diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 5e78e74d4f..9873fdc11a 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1068,7 +1068,7 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) * |G|----tgt----| | * * 3. remap src to any other place. 
- * G prevents src from being remaped on tgt again + * G prevents src from being remapped on tgt again * | |-------------| -> |+++++src+++++| * |G|---tgt-----| | * diff --git a/flog/include/uapi/flog.h b/flog/include/uapi/flog.h index 6061f4556a..5fb71b1053 100644 --- a/flog/include/uapi/flog.h +++ b/flog/include/uapi/flog.h @@ -86,7 +86,7 @@ /* double: 13, */ \ /* long double: 14, */ \ \ - /* Basic poniters */ \ + /* Basic pointers */ \ char *: (1u << (ord - n - 1)), \ signed char *: (1u << (ord - n - 1)), \ unsigned char *: (1u << (ord - n - 1)), \ diff --git a/include/common/scm.h b/include/common/scm.h index bcb198882b..5b6f78a8bd 100644 --- a/include/common/scm.h +++ b/include/common/scm.h @@ -11,7 +11,7 @@ * Because of kernel doing kmalloc for user data passed * in SCM messages, and there is kernel's SCM_MAX_FD as a limit * for descriptors passed at once we're trying to reduce - * the pressue on kernel memory manager and use predefined + * the pressure on kernel memory manager and use predefined * known to work well size of the message buffer. 
*/ #define CR_SCM_MSG_SIZE (1024) diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 6809ec8b9a..1078eafe6f 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -263,7 +263,7 @@ ROCm | Radeon Open Compute Platform Thunk | User-mode API interface to interact with amdgpu.ko KFD | AMD Kernel Fusion Driver Mesa | Open source OpenGL implementation -GTT | Graphis Translation Table, also used to denote kernel-managed system memory for GPU access +GTT | Graphics Translation Table, also used to denote kernel-managed system memory for GPU access VRAM | Video RAM BO | Buffer Object HMM | Heterogeneous Memory Management diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 42689933ee..6d004247be 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -1241,7 +1241,7 @@ static bool map_devices(struct tp_system *src_sys, struct tp_system *dest_sys, s return true; } else { /* We could not map remaining nodes in the list. Add dest node back - * to list and try to map next dest ndoe in list to current src + * to list and try to map next dest node in list to current src * node. 
*/ pr_debug("Nodes after [0x%04X -> 0x%04X] did not match, " diff --git a/test/exhaustive/unix.py b/test/exhaustive/unix.py index 5b4c972cb6..6f72dd44b7 100755 --- a/test/exhaustive/unix.py +++ b/test/exhaustive/unix.py @@ -462,7 +462,7 @@ def set_nonblock(sk): def chk_real_state(st): - # Before enything else -- check that we still have + # Before anything else -- check that we still have # all the sockets at hands for sk in st.sockets: if not sk.visible: diff --git a/test/others/app-emu/java/HelloWorld/run.sh b/test/others/app-emu/java/HelloWorld/run.sh index 0ed6afd141..e6dcbd9fca 100644 --- a/test/others/app-emu/java/HelloWorld/run.sh +++ b/test/others/app-emu/java/HelloWorld/run.sh @@ -18,7 +18,7 @@ setsid java HelloWorld & pid=${!} -echo Lanuched java application with pid $pid in background +echo Launched java application with pid $pid in background ${criu} dump -D dump -o dump.log -v4 --shell-job -t ${pid} || { echo "Dump failed" diff --git a/test/others/app-emu/make/run.sh b/test/others/app-emu/make/run.sh index 7cb44c7709..d871b7d9c4 100644 --- a/test/others/app-emu/make/run.sh +++ b/test/others/app-emu/make/run.sh @@ -28,7 +28,7 @@ setsid make -j4 & pid=${!} -echo Lanuched make in $pid background +echo Launched make in $pid background sleep 2 ${criu} dump --shell-job -D dump -o dump.log -v4 -t ${pid} || { diff --git a/test/zdtm/static/child_opened_proc.c b/test/zdtm/static/child_opened_proc.c index 2125cd264e..cfe04fa4be 100644 --- a/test/zdtm/static/child_opened_proc.c +++ b/test/zdtm/static/child_opened_proc.c @@ -10,7 +10,7 @@ #include "zdtmtst.h" const char *test_doc = "Check that tree prior to files opening"; -const char *test_author = "Stanislav Kinsbursky Date: Mon, 24 Apr 2023 07:49:57 +0000 Subject: [PATCH 012/321] scripts: make newer versions of shellcheck happy Signed-off-by: Adrian Reber --- scripts/install-debian-pkgs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install-debian-pkgs.sh 
b/scripts/install-debian-pkgs.sh index 540c2c0949..8be49c7871 100755 --- a/scripts/install-debian-pkgs.sh +++ b/scripts/install-debian-pkgs.sh @@ -15,7 +15,7 @@ function print_help() function process() { sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' ${REQ_PKGS} )" + sudo apt-get install -yq "$( sed 's/\#.*$//' "${REQ_PKGS}" )" } if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then From 8e0697d5643a370ffb1c2fcb18c4a7afa47440b6 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 25 Apr 2023 12:40:12 +0800 Subject: [PATCH 013/321] criu-ns: make --pidfile option show pid in caller pidns Using the fact that we know criu_pid and criu is a parent of restored process we can create pidfile with pid on caller pidns level. We need to move mount namespace creation to child so that criu-ns can see caller pidns proc. Signed-off-by: Pavel Tikhomirov --- scripts/criu-ns | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index d51e7772c0..0f83ca336d 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -6,6 +6,7 @@ import sys import os import fcntl import termios +import time # constants for unshare CLONE_NEWNS = 0x00020000 @@ -110,8 +111,8 @@ def wrap_restore(): if '--restore-sibling' in restore_args: raise OSError(errno.EINVAL, "--restore-sibling is not supported") - # Unshare pid and mount namespaces - if _unshare(CLONE_NEWNS | CLONE_NEWPID) != 0: + # Unshare pid namespace + if _unshare(CLONE_NEWPID) != 0: _errno = ctypes.get_errno() raise OSError(_errno, errno.errorcode[_errno]) @@ -123,8 +124,32 @@ def wrap_restore(): restore_detached = True restore_args.remove('--restore-detached') + restore_pidfile = None + if '--pidfile' in restore_args: + try: + opt_index = restore_args.index('--pidfile') + restore_pidfile = restore_args[opt_index + 1] + del restore_args[opt_index:opt_index + 2] + except (ValueError, IndexError, FileNotFoundError): + raise 
OSError(errno.ENOENT, "--pidfile missing argument") + + if not restore_pidfile.startswith('/'): + for base_dir_opt in ['--work-dir', '-W', '--images-dir', '-D']: + if base_dir_opt in restore_args: + try: + opt_index = restore_args.index(base_dir_opt) + restore_pidfile = os.path.join(restore_args[opt_index + 1], restore_pidfile) + break + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, base_dir_opt + " missing argument") + criu_pid = os.fork() if criu_pid == 0: + # Unshare mount namespace + if _unshare(CLONE_NEWNS) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + os.setsid() # Set stdin tty to be a controlling tty of our new session, this is # required by --shell-job option, as for it CRIU would try to set a @@ -139,6 +164,25 @@ def wrap_restore(): _mount_new_proc() run_criu(restore_args) + if restore_pidfile: + restored_pid = None + retry = 5 + + while not restored_pid and retry: + with open('/proc/%d/task/%d/children' % (criu_pid, criu_pid)) as f: + line = f.readline().strip() + if len(line): + restored_pid = line + break + retry -= 1 + time.sleep(1) + + if restored_pid: + with open(restore_pidfile, 'w+') as f: + f.write(restored_pid) + else: + print("Warn: Search of restored pid for --pidfile option timeouted") + if restore_detached: return 0 From 806ee350159b6d8a50b59ae87bbd295377233134 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 17 May 2023 09:06:15 +0100 Subject: [PATCH 014/321] docs: rename amdgpu_plugin.txt to criu-amdgpu-plugin.txt By default, the file name 'amdgpu_plugin.txt' is used also as the name for the corresponding man page (`man amdgpu_plugin`). However, when this man page is installed system-wide it would be more appropriate to have a prefix 'criu-' (e.g., `man criu-amdgpu-plugin`). 
Signed-off-by: Radostin Stoyanov --- Documentation/Makefile | 2 +- Documentation/{amdgpu_plugin.txt => criu-amdgpu-plugin.txt} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename Documentation/{amdgpu_plugin.txt => criu-amdgpu-plugin.txt} (94%) diff --git a/Documentation/Makefile b/Documentation/Makefile index 5085514501..72bf0e8623 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -16,7 +16,7 @@ ifeq ($(PYTHON),python3) SRC1 += criu-ns.txt endif SRC1 += compel.txt -SRC1 += amdgpu_plugin.txt +SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) diff --git a/Documentation/amdgpu_plugin.txt b/Documentation/criu-amdgpu-plugin.txt similarity index 94% rename from Documentation/amdgpu_plugin.txt rename to Documentation/criu-amdgpu-plugin.txt index 0d490b4292..48a8e2f6d1 100644 --- a/Documentation/amdgpu_plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -3,7 +3,7 @@ ROCM Support(1) NAME ---- -amdgpu_plugin - A plugin extension to CRIU to support checkpoint/restore in +criu-amdgpu-plugin - A plugin extension to CRIU to support checkpoint/restore in userspace for AMD GPUs. @@ -22,7 +22,7 @@ Though *criu* is a great tool for checkpointing and restoring running applications, it has certain limitations such as it cannot handle applications that have device files open. In order to support *ROCm* based workloads with *criu* we need to augment criu's core functionality with a -plugin based extension mechanism. *amdgpu_plugin* provides the necessary support +plugin based extension mechanism. *criu-amdgpu-plugin* provides the necessary support to criu to allow Checkpoint / Restore with ROCm. From f57bda46a3b371bdb375de5639c4b0de92e75dec Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 11 May 2023 16:18:31 +0000 Subject: [PATCH 015/321] lib/c: add empty_ns interfaces to libcriu crun wants to set empty_ns and this interface is missing from the library. This adds it to libcriu. 
Signed-off-by: Adrian Reber --- lib/c/criu.c | 11 +++++++++++ lib/c/criu.h | 3 +++ 2 files changed, 14 insertions(+) diff --git a/lib/c/criu.c b/lib/c/criu.c index fc8159999c..0095bcc9bc 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -2030,3 +2030,14 @@ int criu_feature_check(struct criu_feature_check *features, size_t size) { return criu_local_feature_check(global_opts, features, size); } + +void criu_local_set_empty_ns(criu_opts *opts, int namespaces) +{ + opts->rpc->has_empty_ns = true; + opts->rpc->empty_ns = namespaces; +} + +void criu_set_empty_ns(int namespaces) +{ + criu_local_set_empty_ns(global_opts, namespaces); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index 28a083d88d..3b9cedfd09 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -322,6 +322,9 @@ struct criu_feature_check { int criu_feature_check(struct criu_feature_check *features, size_t size); int criu_local_feature_check(criu_opts *opts, struct criu_feature_check *features, size_t size); +void criu_local_set_empty_ns(criu_opts *opts, int namespaces); +void criu_set_empty_ns(int namespaces); + #ifdef __GNUG__ } #endif From f3082720622e28cfea1415a6bb285a73546db7dc Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Tue, 4 Apr 2023 13:54:11 +0530 Subject: [PATCH 016/321] criu-ns: Add --criu-binary argument to run_criu() --criu-binary argument provides a way to supply the CRIU binary location to run_criu(). 
Related to: #1909 Signed-off-by: Dhanuka Warusadura --- scripts/criu-ns | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index 0f83ca336d..d4d867b660 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -81,8 +81,21 @@ def run_criu(args): Spawn CRIU binary """ print(sys.argv) - os.execlp('criu', *['criu'] + args) - raise OSError(errno.ENOENT, "No such command") + + if "--criu-binary" in args: + try: + opt_index = args.index("--criu-binary") + path = args[opt_index + 1] + del args[opt_index:opt_index + 2] + args.insert(0, "criu") + os.execv(path, args) + raise OSError(errno.ENOENT, "No such command") + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, "--criu-binary missing argument") + else: + args.insert(0, "criu") + os.execvp("criu", args) + raise OSError(errno.ENOENT, "No such command") # pidns_holder creates a process that is reparented to the init. From 38db5e1f2fa613733e0902599bcdf6ea30d98086 Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Mon, 17 Apr 2023 13:00:39 +0530 Subject: [PATCH 017/321] criu-ns: Add support for older Python version in CI These changes remove and update the changes introduced in 7177938e60b81752a44a8116b3e7e399c24c4fcb in favor of the Python version in CI. os.waitstatus_to_exitcode() function appeared in Python 3.9 Related to: #1909 Signed-off-by: Dhanuka Warusadura --- scripts/criu-ns | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index d4d867b660..4c032aa140 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -71,7 +71,19 @@ def _wait_for_process_status(criu_pid): try: (pid, status) = os.wait() if pid == criu_pid: - return os.waitstatus_to_exitcode(status) + # The following code block is based on + # os.waitstatus_to_exitcode() introduced in Python 3.9 + # and we implement this for comparability with older + # versions of Python. 
+ if os.WIFSIGNALED(status): + return os.WTERMSIG(status) + elif os.WIFEXITED(status): + return os.WEXITSTATUS(status) + elif os.WIFSTOPPED(status): + return os.WSTOPSIG(status) + else: + raise Exception("CRIU was terminated by an " + "unidentified reason") except OSError: return -251 From 8094df8ddb9644f715212bc18226e94a1560bbff Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Wed, 8 Mar 2023 18:19:17 +0530 Subject: [PATCH 018/321] criu-ns: Add tests for criu-ns script These changes add test implementations for criu-ns script. Fixes: #1909 Signed-off-by: Dhanuka Warusadura --- Makefile | 1 + scripts/ci/run-ci-tests.sh | 1 + test/Makefile | 2 +- test/others/criu-ns/Makefile | 3 + test/others/criu-ns/run.py | 258 +++++++++++++++++++++++++++++++++++ 5 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 test/others/criu-ns/Makefile create mode 100755 test/others/criu-ns/run.py diff --git a/Makefile b/Makefile index 377c6a3b5f..23f68e2f31 100644 --- a/Makefile +++ b/Makefile @@ -438,6 +438,7 @@ lint: flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py flake8 --config=scripts/flake8.cfg lib/py/images/images.py flake8 --config=scripts/flake8.cfg scripts/criu-ns + flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py flake8 --config=scripts/flake8.cfg crit/setup.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py flake8 --config=scripts/flake8.cfg coredump/ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 229de97c1c..b45183a847 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -260,6 +260,7 @@ if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi +make -C test/others/criu-ns/ run make -C test/others/skip-file-rwx-check/ run make -C test/others/rpc/ run diff --git a/test/Makefile b/test/Makefile index e8fcffe3fc..5784b6a495 100644 --- a/test/Makefile +++ 
b/test/Makefile @@ -12,7 +12,7 @@ all: $(MAKE) zdtm-freezer .PHONY: all -TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job skip-file-rwx-check +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job criu-ns skip-file-rwx-check other: for t in $(TESTS); do \ diff --git a/test/others/criu-ns/Makefile b/test/others/criu-ns/Makefile new file mode 100644 index 0000000000..4d901a1116 --- /dev/null +++ b/test/others/criu-ns/Makefile @@ -0,0 +1,3 @@ +run: + @make -C ../.. zdtm_ct + ../../zdtm_ct run.py diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py new file mode 100755 index 0000000000..6967b46b29 --- /dev/null +++ b/test/others/criu-ns/run.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python + +import fcntl +import os +import pathlib +import pty +import shutil +import subprocess +import sys +import termios +import time + + +CRIU_BIN = "../../../criu/criu" +CRIU_NS = "../../../scripts/criu-ns" +IMG_DIR = "dumpdir" +DUMP_LOG = "dump.log" +RESTORE_LOG = "restore.log" +PIDFILE = "pidfile" + + +def check_dumpdir(path=IMG_DIR): + if os.path.isdir(path): + shutil.rmtree(path) + os.mkdir(path, 0o755) + + +def set_blocking(fd, blocking): + """Implement os.set_blocking() for compatibility with Python + versions earlier than 3.5""" + flags = fcntl.fcntl(fd, fcntl.F_GETFL) + + if blocking: + flags &= ~os.O_NONBLOCK + else: + flags |= os.O_NONBLOCK + + fcntl.fcntl(fd, fcntl.F_SETFL, flags) + + +def run_task_with_own_pty(task): + fd_m, fd_s = pty.openpty() + + pid = os.fork() + if pid == 0: + os.close(fd_m) + os.setsid() + os.dup2(fd_s, 0) + os.dup2(fd_s, 1) + os.dup2(fd_s, 2) + fcntl.ioctl(fd_s, termios.TIOCSCTTY, 1) + os.close(fd_s) + task() + exit(0) + + os.close(fd_s) + fd_m = os.fdopen(fd_m, "rb") + set_blocking(fd_m.fileno(), False) + + while True: + try: + data = fd_m.read() + except IOError: + break + if data is not None: + 
print(data.decode("utf-8")) + + _, status = os.waitpid(pid, 0) + + try: + data = fd_m.read() + except IOError as err: + print(err) + + if data is not None: + print(data.decode("utf-8")) + fd_m.close() + + if status != 0: + print("task %s exited badly: %d" % (task.__name__, status)) + exit(1) + + return 0 + + +def create_pty(): + fd_m, fd_s = pty.openpty() + return (os.fdopen(fd_m, "wb"), os.fdopen(fd_s, "wb")) + + +def create_isolated_dumpee(): + pathlib.Path("running").touch() + fd_m, fd_s = create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + os.dup2(fd_s.fileno(), 0) + os.dup2(fd_s.fileno(), 1) + os.dup2(fd_s.fileno(), 2) + fcntl.ioctl(fd_s.fileno(), termios.TIOCSCTTY, 1) + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + fd_m.close() + fd_s.close() + return pid + + +def criu_ns_dump(pid, shell_job=False): + cmd = [CRIU_NS, "dump", "-D", IMG_DIR, "-v4", "-t", str(pid), + "--log-file", DUMP_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + ret = subprocess.Popen(cmd).wait() + return ret + + +def criu_ns_restore(shell_job=False, restore_detached=False): + cmd = [CRIU_NS, "restore", "-D", IMG_DIR, "-v4", "--log-file", + RESTORE_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + if restore_detached: + cmd += ["--restore-detached", "--pidfile", PIDFILE] + ret = subprocess.Popen(cmd).wait() + return ret + + +def read_log_file(filename): + logfile_path = os.path.join(IMG_DIR, filename) + with open(logfile_path) as logfile: + print(logfile.read()) + + +def test_dump_and_restore_with_shell_job(): + print("Test criu-ns dump and restore with --shell-job option") + check_dumpdir() + pathlib.Path("running").touch() + pid = os.fork() + if pid == 0: + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + + ret = criu_ns_dump(pid, shell_job=True) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + os.unlink("running") + fd_m, fd_s = 
create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + fd_m.close() + # since criu-ns takes control of the tty stdin + os.dup2(fd_s.fileno(), 0) + ret = criu_ns_restore(shell_job=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + fd_s.close() + os.waitpid(pid, 0) + + +def test_dump_and_restore_without_shell_job(restore_detached=False): + print("Test criu-ns dump and restore with an isolated process" + "(%d)" % restore_detached) + check_dumpdir() + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + if not restore_detached: + os.unlink("running") + + pid = os.fork() + if pid == 0: + os.setsid() + ret = criu_ns_restore(restore_detached=restore_detached) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + os.waitpid(pid, 0) + + +def test_dump_and_restore_in_pidns(): + if os.system("grep NSpid /proc/self/status"): + return + + print("Test criu-ns dump and restore in namespaces") + + def _dump(): + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _restore(): + ret = criu_ns_restore(restore_detached=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + def _get_restored_pid(): + restored_pid = 0 + pidfile_path = os.path.join(IMG_DIR, PIDFILE) + if not os.path.exists(pidfile_path): + raise FileNotFoundError("pidfile not found") + with open(pidfile_path, "r") as pidfile: + restored_pid = pidfile.read().strip() + return int(restored_pid) + + def _redump(): + global IMG_DIR + try: + restored_pid = _get_restored_pid() + except FileNotFoundError: + sys.exit(1) + IMG_DIR = "dumpdir2" + check_dumpdir(IMG_DIR) + ret = criu_ns_dump(restored_pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _re_restore(): + os.unlink("running") + ret = criu_ns_restore() + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + check_dumpdir() + _dump() + 
_restore() + _redump() + _re_restore() + + +def main(): + test_dump_and_restore_with_shell_job() + test_dump_and_restore_without_shell_job() + test_dump_and_restore_without_shell_job(restore_detached=True) + test_dump_and_restore_in_pidns() + + +if __name__ == "__main__": + run_task_with_own_pty(main) From f0e93585903cbf3450563f97fc47a9cdc01c709f Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Mon, 22 May 2023 14:51:13 +0530 Subject: [PATCH 019/321] criu-ns: Install Python pathlib module in CentOS 7 These changes fix the `ImportError: No module named pathlib` error when executing criu-ns tests located at criu/test/others/criu-ns Signed-off-by: Dhanuka Warusadura --- .cirrus.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index bd4799fd0b..80f3296fce 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -134,11 +134,11 @@ task: memory: 8G setup_script: | - # EPEL is needed for python2-future, python2-junit_xml, python-flake8 and libbsd-devel. + # EPEL is needed for python2-future, python2-junit_xml, python-pathlib, python-flake8 and libbsd-devel. 
# Do not fail if latest epel repository definition is already installed yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm || : ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel + yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six python-pathlib sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel # Even with selinux in permissive mode the selinux tests will be executed # The Cirrus CI user runs as a service from selinux point of view and is # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) From 9130fefa4dd4e0c3029ab062fefd796a863a8880 Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Mon, 22 May 2023 15:06:14 +0530 Subject: [PATCH 020/321] criu-ns: Update shebang line to python CentOS 7 CI environment uses Python 2. To execute criu-ns script in CentOS 7 changing the current shebang line to python is required. 
This reverts the changes made in a15a63fce0ad4d1a9119771577fa7ef562bbfd6b Signed-off-by: Dhanuka Warusadura --- scripts/criu-ns | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index 4c032aa140..3c77b8eb49 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import ctypes import ctypes.util import errno From 358f09cf48a66e0f643c787efadacdea5b921361 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 May 2023 22:59:28 +0800 Subject: [PATCH 021/321] timers: improve and fix posix timer id sequence checks This is a patch proposed by Thomas here: https://lore.kernel.org/all/87ilczc7d9.ffs@tglx/ It removes (created id > desired id) "sanity" check and adds proper checking that ids start at zero and increment by one each time when we create/delete a posix timer. First purpose of it is to fix infinite looping in create_posix_timers on old pre 3.11 kernels. Second purpose is to allow kernel interface of creating posix timers with desired id change from iterating with predictable next id to just setting next id directly. And at the same time removing predictable next id so that criu with this patch would not get to infinite loop in create_posix_timers if this happens. Thanks a lot to Thomas!
Signed-off-by: Pavel Tikhomirov --- criu/pie/restorer.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9873fdc11a..1f08bc2a08 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1169,7 +1169,7 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { int ret, i; - kernel_timer_t next_id; + kernel_timer_t next_id = 0, timer_id; struct sigevent sev; for (i = 0; i < args->posix_timers_n; i++) { @@ -1183,25 +1183,26 @@ static int create_posix_timers(struct task_restore_args *args) sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; while (1) { - ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); return ret; } - if (next_id == args->posix_timers[i].spt.it_id) + if (timer_id != next_id) { + pr_err("Can't create timers, kernel don't give them consequently\n"); + return -1; + } + next_id++; + + if (timer_id == args->posix_timers[i].spt.it_id) break; - ret = sys_timer_delete(next_id); + ret = sys_timer_delete(timer_id); if (ret < 0) { - pr_err("Can't remove temporaty posix timer 0x%x\n", next_id); + pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); return ret; } - - if ((long)next_id > args->posix_timers[i].spt.it_id) { - pr_err("Can't create timers, kernel don't give them consequently\n"); - return -1; - } } } From 104a82856f83bfa24fb08811778f7f967d1d12ae Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 2 Apr 2023 15:45:05 +0100 Subject: [PATCH 022/321] action-scripts: Add pre-stream hook This hook allows to start image streamer process from an action script. 
Signed-off-by: Radostin Stoyanov --- criu/action-scripts.c | 1 + criu/img-streamer.c | 8 ++++++++ criu/include/action-scripts.h | 1 + 3 files changed, 10 insertions(+) diff --git a/criu/action-scripts.c b/criu/action-scripts.c index 1ce6d9c108..ec0563e162 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -18,6 +18,7 @@ #include "common/scm.h" static const char *action_names[ACT_MAX] = { + [ACT_PRE_STREAM] = "pre-stream", [ACT_PRE_DUMP] = "pre-dump", [ACT_POST_DUMP] = "post-dump", [ACT_PRE_RESTORE] = "pre-restore", diff --git a/criu/img-streamer.c b/criu/img-streamer.c index 7e36eae012..305e6fae5e 100644 --- a/criu/img-streamer.c +++ b/criu/img-streamer.c @@ -12,6 +12,7 @@ #include "rst-malloc.h" #include "common/scm.h" #include "common/lock.h" +#include "action-scripts.h" /* * We use different path names for the dump and restore sockets because: @@ -49,10 +50,17 @@ static const char *socket_name_for_mode(int mode) int img_streamer_init(const char *image_dir, int mode) { struct sockaddr_un addr; + int pre_stream_ret; int sockfd; img_streamer_mode = mode; + pre_stream_ret = run_scripts(ACT_PRE_STREAM); + if (pre_stream_ret != 0) { + pr_err("Pre-stream script failed with %d!\n", pre_stream_ret); + return -1; + } + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sockfd < 0) { pr_perror("Unable to instantiate UNIX socket"); diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h index c2e8850aab..793698c27c 100644 --- a/criu/include/action-scripts.h +++ b/criu/include/action-scripts.h @@ -4,6 +4,7 @@ #include "asm/int.h" enum script_actions { + ACT_PRE_STREAM, ACT_PRE_DUMP, ACT_POST_DUMP, ACT_PRE_RESTORE, From 4d137b81a0b6e4c5dc5bb093d2cbbc6ea04cb486 Mon Sep 17 00:00:00 2001 From: Valeriy Vdovin Date: Tue, 9 Feb 2021 16:55:48 +0300 Subject: [PATCH 023/321] cgroup/restore: split prepare_task_cgroup code into two separate functions This does cgroup namespace creation separately from joining task cgroups. 
This makes the code more logical, because creating cgroup namespace also involves joining cgroups but these cgroups can be different to task's cgroups as they are cgroup namespace roots (cgns_prefix), and mixing all of them together may lead to misunderstanding. Another positive thing is that we consolidate !item->parent checks in one place in restore_task_with_children. Signed-off-by: Valeriy Vdovin Signed-off-by: Pavel Tikhomirov --- criu/cgroup.c | 54 ++++++++++++++++++++++++++++++++----------- criu/cr-restore.c | 9 +++++++- criu/include/cgroup.h | 3 ++- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 8243ac6d3c..bcb7b405a3 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1202,17 +1202,12 @@ static int prepare_cgns(CgSetEntry *se) return 0; } -static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) +static int move_in_cgroup(CgSetEntry *se) { int i; pr_info("Move into %d\n", se->id); - if (setup_cgns && prepare_cgns(se) < 0) { - pr_err("failed preparing cgns\n"); - return -1; - } - for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int fd = -1, err, j, aux_off; @@ -1252,7 +1247,44 @@ static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) return 0; } -int prepare_task_cgroup(struct pstree_item *me) +int prepare_cgroup_namespace(struct pstree_item *root_task) +{ + CgSetEntry *se; + + if (opts.manage_cgroups == CG_MODE_IGNORE) + return 0; + + if (root_task->parent) { + pr_err("Expecting root_task to restore cgroup namespace\n"); + return -1; + } + + /* + * If on dump all dumped tasks are in same cgset with criu we don't + * dump cgsets and thus cgroup namespaces and rely that on restore + * criu caller would prepare proper cgset/cgns for us. Also in case + * of --unprivileged we don't even have the root cgset here. 
+ */ + if (!rsti(root_task)->cg_set || rsti(root_task)->cg_set == root_cg_set) { + pr_info("Cgroup namespace inherited from parent\n"); + return 0; + } + + se = find_rst_set_by_id(rsti(root_task)->cg_set); + if (!se) { + pr_err("No set %d found\n", rsti(root_task)->cg_set); + return -1; + } + + if (prepare_cgns(se) < 0) { + pr_err("failed preparing cgns\n"); + return -1; + } + + return 0; +} + +int restore_task_cgroup(struct pstree_item *me) { struct pstree_item *parent = me->parent; CgSetEntry *se; @@ -1284,13 +1316,7 @@ int prepare_task_cgroup(struct pstree_item *me) return -1; } - /* Since don't support nesting of cgroup namespaces, let's only set up - * the cgns (if it exists) in the init task. In the future, we should - * just check that the cgns prefix string matches for all the entries - * in the cgset, and only unshare if that's true. - */ - - return move_in_cgroup(se, !me->parent); + return move_in_cgroup(se); } void fini_cgroup(void) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index f02e95f6d2..2b99a775d8 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1827,6 +1827,13 @@ static int restore_task_with_children(void *_arg) /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; + + /* + * Since we don't support nesting of cgroup namespaces, let's + * only set up the cgns (if it exists) in the init task. + */ + if (prepare_cgroup_namespace(current) < 0) + goto err; } if (needs_prep_creds(current) && (prepare_userns_creds())) @@ -1838,7 +1845,7 @@ static int restore_task_with_children(void *_arg) * we will only move the root one there, others will * just have it inherited. 
*/ - if (prepare_task_cgroup(current) < 0) + if (restore_task_cgroup(current) < 0) goto err; /* Restore root task */ diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 93f61539cf..dc264032e8 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -9,7 +9,8 @@ struct parasite_dump_cgroup_args; extern u32 root_cg_set; int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); int dump_cgroups(void); -int prepare_task_cgroup(struct pstree_item *); +int restore_task_cgroup(struct pstree_item *); +int prepare_cgroup_namespace(struct pstree_item *); int prepare_cgroup(void); /* Restore things like cpu_limit in known cgroups. */ int prepare_cgroup_properties(void); From af0e413e03f5ac22139a7f8a4d81db75a5b0fb86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 29 May 2023 19:28:19 +0200 Subject: [PATCH 024/321] Fix dumping hugetlb-based memfd on kernels < 4.16. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 4.15-based kernels don't allow F_*SEAL for memfds created with MFD_HUGETLB. Since seals are not possible in this case, fake F_GETSEALS result as if it was queried for a non-sealing-enabled memfd. Signed-off-by: Michał Mirosław --- criu/memfd.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/criu/memfd.c b/criu/memfd.c index da29377034..6a43dece60 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -93,8 +93,17 @@ static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char * } mie.seals = fcntl(fd, F_GET_SEALS); - if (mie.seals == -1) - goto out; + if (mie.seals == -1) { + if (errno != EINVAL || ~mie.hugetlb_flag & MFD_HUGETLB) { + pr_perror("fcntl(F_GET_SEALS)"); + goto out; + } + /* Kernels before 4.16 don't allow MFD_HUGETLB | + * MFD_ALLOW_SEALING and return EINVAL for + * fcntl(MFD_HUGETLB-enabled fd). 
+ */ + mie.seals = F_SEAL_SEAL; + } if (pb_write_one(img_from_set(glob_imgset, CR_FD_MEMFD_INODE), &mie, PB_MEMFD_INODE)) goto out; From 11288c968d57e001df32a17346c5d262a32d3958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 15:07:43 +0200 Subject: [PATCH 025/321] Fix mount(cgroup2) for older kernels. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux 4.15 doesn't like empty string for cgroup2 mount options. Pass NULL then to satisfy the kernel check. Log the options for easier debugging. Signed-off-by: Michał Mirosław --- criu/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index bcb7b405a3..0bf7b3818c 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -639,8 +639,8 @@ static int open_cgroupfs(struct cg_ctl *cc) return -1; } - if (mount("none", prefix, fstype, 0, mopts) < 0) { - pr_perror("Unable to mount %s", mopts); + if (mount("none", prefix, fstype, 0, mopts[0] ? mopts : NULL) < 0) { + pr_perror("Unable to mount %s %s", fstype, mopts); rmdir(prefix); return -1; } From 9e6454f50b0f06683ac5a0875535443efb498f45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 2 Jun 2023 18:02:38 +0200 Subject: [PATCH 026/321] Restore THP_DISABLE prctl. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original commit added saving THP_DISABLED flag value, but missed restoring it. There is restoring code, but used only when --lazy_pages mode is enabled. Restore the prctl flag always. While at it, rename the `has_thp_enabled` -> `!thp_disabled` for consistency. 
Fixes: bbbd597b4124 (2017-06-28 "mem: add dump state of THP_DISABLED prctl") Signed-off-by: Michał Mirosław --- criu/cr-restore.c | 2 +- criu/include/restorer.h | 2 +- criu/include/rst_info.h | 2 -- criu/mem.c | 4 ---- criu/pie/restorer.c | 16 ++++++---------- 5 files changed, 8 insertions(+), 18 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 2b99a775d8..bff41dc565 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2971,7 +2971,7 @@ static int prepare_mm(pid_t pid, struct task_restore_args *args) args->fd_exe_link = exe_fd; - args->has_thp_enabled = rsti(current)->has_thp_enabled; + args->thp_disabled = mm->has_thp_disabled && mm->thp_disabled; ret = 0; out: diff --git a/criu/include/restorer.h b/criu/include/restorer.h index bc0beb5cbb..e232f54040 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -144,7 +144,7 @@ struct task_restore_args { struct timeval logstart; int uffd; - bool has_thp_enabled; + bool thp_disabled; /* threads restoration */ int nr_threads; /* number of threads */ diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index d0a3db6c5d..704b42a727 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -73,8 +73,6 @@ struct rst_info { */ bool has_old_seccomp_filter; - bool has_thp_enabled; - struct rst_rseq *rseqe; void *breakpoint; diff --git a/criu/mem.c b/criu/mem.c index 9bf7cae971..417e0a21de 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1217,8 +1217,6 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) { - MmEntry *mm = rsti(t)->mm; - /* * There is no need to disable it if the page read doesn't * have parent. 
In this case VMA will be empty until @@ -1241,8 +1239,6 @@ static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) pr_perror("Cannot disable THP"); return -1; } - if (!(mm->has_thp_disabled && mm->thp_disabled)) - rsti(t)->has_thp_enabled = true; return 0; } diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 1f08bc2a08..0d1360c52b 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1635,17 +1635,13 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } - if (args->uffd > -1) { - /* re-enable THP if we disabled it previously */ - if (args->has_thp_enabled) { - int ret; - ret = sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0); - if (ret) { - pr_err("Cannot re-enable THP: %d\n", ret); - goto core_restore_end; - } - } + ret = sys_prctl(PR_SET_THP_DISABLE, args->thp_disabled, 0, 0, 0); + if (ret) { + pr_err("Cannot restore THP_DISABLE=%d flag: %ld\n", args->thp_disabled, ret); + goto core_restore_end; + } + if (args->uffd > -1) { pr_debug("lazy-pages: closing uffd %d\n", args->uffd); /* * All userfaultfd configuration has finished at this point. From 7ca6856be45094b417519fd8cfcb4a1a5365dfc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 2 Jun 2023 17:22:06 +0200 Subject: [PATCH 027/321] Log if prctl(SET_THP_DISABLE) doesn't work as expected. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If prctl(SET_THP_DISABLE) is not used due to bad semantics, log it for easier debugging. 
Signed-off-by: Michał Mirosław --- criu/kerndat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/kerndat.c b/criu/kerndat.c index bc0c7ba05d..d38e8898ef 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1324,6 +1324,8 @@ int kerndat_has_thp_disable(void) parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); + if (!kdat.has_thp_disable) + pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE"); break; } } From d3a33ca1efcfd2cf45dd972cde6e0d350482a703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 17 May 2023 21:51:59 +0200 Subject: [PATCH 028/321] zdtm: thp_disable: Output a single failure message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While at it, don't carry over stale errno to the fail() message. Signed-off-by: Michał Mirosław --- test/zdtm/static/thp_disable.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index ab88120c2c..58d6039f8f 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -47,15 +47,14 @@ int main(int argc, char **argv) if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) return -1; + errno = 0; if (orig_flags != new_flags) { - pr_err("Flags are changed %lx -> %lx\n", orig_flags, new_flags); - fail(); + fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); return -1; } if (orig_madv != new_madv) { - pr_err("Madvs are changed %lx -> %lx\n", orig_madv, new_madv); - fail(); + fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); return -1; } From 6006cb6eaf034f5581a6f52a981ea64534fe32e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 2 Jun 2023 17:11:16 +0200 Subject: [PATCH 029/321] zdtm: thp_disable: Verify prctl(THP_DISABLE) migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Signed-off-by: Michał Mirosław --- test/zdtm/static/thp_disable.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index 58d6039f8f..e385087788 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -17,6 +17,7 @@ int main(int argc, char **argv) unsigned long orig_flags = 0, new_flags = 0; unsigned long orig_madv = 0, new_madv = 0; void *area; + int ret; test_init(argc, argv); @@ -35,9 +36,31 @@ int main(int argc, char **argv) return -1; } + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); + return -1; + } + test_daemon(); test_waitsig(); + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting post-migration THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); + return -1; + } + if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { pr_perror("Enabling THP failed"); return -1; From c75c017e4c22bf8e5082a0cec1d8e9df76bac9dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 2 Jun 2023 19:01:29 +0200 Subject: [PATCH 030/321] zdtm: thp_disable: Verify MADV_NOHUGEPAGE before migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a sanity check for THP_DISABLE. This discovered a broken commit in Google's kernel tree. 
Signed-off-by: Michał Mirosław --- test/zdtm/static/thp_disable.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index e385087788..eabb45650d 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -47,6 +47,21 @@ int main(int argc, char **argv) return -1; } + test_msg("Fetch pre-migration flags/adv\n"); + if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) + return -1; + + errno = 0; + if (orig_flags != new_flags) { + fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); + return -1; + } + + if (orig_madv != new_madv) { + fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); + return -1; + } + test_daemon(); test_waitsig(); From c6ee1ba05e7bc88d9fb2de9643a0dfd25379a31c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 13 Oct 2021 06:32:43 +0200 Subject: [PATCH 031/321] Fill FPU init state if it's not provided by kernel. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apparently Skylake uses init-optimization when saving FPU state, and ptrace() returns XSTATE_BV[0] = 0 meaning FPU was not used by a task (in init state). Since CRIU restore uses sigreturn to restore registers, FPU state is always restored. Fill the state with default values on dump to make restore happy. Signed-off-by: Michał Mirosław --- compel/arch/x86/src/lib/infect.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 01959b95b2..88bdb4047e 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -220,6 +220,16 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr #define get_signed_user_reg(pregs, name) \ ((user_regs_native(pregs)) ? 
(int64_t)((pregs)->native.name) : (int32_t)((pregs)->compat.name)) +static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) +{ + if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { + pr_perror("Can't obtain FPU registers for %d", pid); + return -1; + } + + return 0; +} + static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) { struct iovec iov; @@ -232,14 +242,15 @@ static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) return -1; } - return 0; -} - -static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) -{ - if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { - pr_perror("Can't obtain FPU registers for %d", pid); - return -1; + if ((xsave->xsave_hdr.xstate_bv & 3) != 3) { + // Due to init-optimisation [1] x87 FPU or SSE state may not be filled in. + // Since those are restored unconditionally, make sure the init values are + // filled by retrying with old PTRACE_GETFPREGS. + // + // [1] Intel® 64 and IA-32 Architectures Software Developer's + // Manual Volume 1: Basic Architecture + // Section 13.6: Processor tracking of XSAVE-managed state + return get_task_fpregs(pid, xsave); } return 0; From 9e2e56006b8e59d1aee437da01dbc3659e285e25 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Thu, 15 Jun 2023 15:13:58 -0700 Subject: [PATCH 032/321] compel/test: Return 0 in case of error in fdspy This commit revises the error handling in the fdspy test. Previously, a failure case could have been incorrectly reported as successful because of a specific check `pass != 0`, leading to potential false positives when `check_pipe_ends()` returned `-1` due to a read/write pipe error. To improve this, we've adjusted the error handling to return `0` in case of any error. As such, the final success condition remains unchanged. This approach will help accurately differentiate between successful and failed cases, ensuring the output "All OK" is printed for success, and "Something went WRONG" for any failure. 
Fixes: 5364ca3 ("compel/test: Fix warn_unused_result") Signed-off-by: Haorong Lu --- compel/test/fdspy/spy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compel/test/fdspy/spy.c b/compel/test/fdspy/spy.c index 7f20ea2a7f..41de99e200 100644 --- a/compel/test/fdspy/spy.c +++ b/compel/test/fdspy/spy.c @@ -110,11 +110,11 @@ static int check_pipe_ends(int wfd, int rfd) printf("Check pipe ends are connected\n"); if (write(wfd, "1", 2) != 2) { fprintf(stderr, "write to pipe failed\n"); - return -1; + return 0; } if (read(rfd, aux, sizeof(aux)) != sizeof(aux)) { fprintf(stderr, "read from pipe failed\n"); - return -1; + return 0; } if (aux[0] != '1' || aux[1] != '\0') { fprintf(stderr, "Pipe connectivity lost\n"); From 3d4d943253483bd6d2c16da75daaa0eb3d4d8b66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 16 Sep 2022 11:01:48 +0200 Subject: [PATCH 033/321] Allow passing --leave_stopped by RPC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/cr-service.c | 3 +++ images/rpc.proto | 1 + 2 files changed, 4 insertions(+) diff --git a/criu/cr-service.c b/criu/cr-service.c index 314c309be9..ed4f1edef6 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -428,6 +428,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_leave_running && req->leave_running) opts.final_state = TASK_ALIVE; + if (req->has_leave_stopped && req->leave_stopped) + opts.final_state = TASK_STOPPED; + if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; diff --git a/images/rpc.proto b/images/rpc.proto index afd2c7b43f..6451e9b734 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -140,6 +140,7 @@ message criu_opts { optional bool mntns_compat_mode = 65; optional bool skip_file_rwx_check = 66; optional bool unprivileged = 67; + optional bool leave_stopped = 69; /* optional bool check_mounts = 128; */ } From 
161a5ff8d4f6be1a45afd10581287eeda48f5aef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20C=C5=82api=C5=84ski?= Date: Fri, 16 Sep 2022 11:06:28 +0200 Subject: [PATCH 034/321] Allow passing --display_stats via RPC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/cr-service.c | 3 +++ images/rpc.proto | 1 + 2 files changed, 4 insertions(+) diff --git a/criu/cr-service.c b/criu/cr-service.c index ed4f1edef6..9aa9d5bc8a 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -723,6 +723,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->orphan_pts_master) opts.orphan_pts_master = true; + if (req->has_display_stats) + opts.display_stats = req->display_stats; + /* Evaluate additional configuration file a second time to overwrite * all RPC settings. */ if (req->config_file) { diff --git a/images/rpc.proto b/images/rpc.proto index 6451e9b734..cde162f1c2 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -141,6 +141,7 @@ message criu_opts { optional bool skip_file_rwx_check = 66; optional bool unprivileged = 67; optional bool leave_stopped = 69; + optional bool display_stats = 70; /* optional bool check_mounts = 128; */ } From 7f7b553af3d87a4573ac54b06f701c304cdb347f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20C=C5=82api=C5=84ski?= Date: Fri, 16 Sep 2022 11:14:59 +0200 Subject: [PATCH 035/321] Allow passing --log_to_stderr via RPC. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/cr-service.c | 3 +++ images/rpc.proto | 1 + 2 files changed, 4 insertions(+) diff --git a/criu/cr-service.c b/criu/cr-service.c index 9aa9d5bc8a..915ba38709 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -394,6 +394,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; } else if (!opts.output) { SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); } diff --git a/images/rpc.proto b/images/rpc.proto index cde162f1c2..79623f9f6c 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -142,6 +142,7 @@ message criu_opts { optional bool unprivileged = 67; optional bool leave_stopped = 69; optional bool display_stats = 70; + optional bool log_to_stderr = 71; /* optional bool check_mounts = 128; */ } From e55e168e9049b762963994f7ebc3ad2692668f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 21 Apr 2023 15:56:06 +0200 Subject: [PATCH 036/321] zdtm: Allow overriding /tmp. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use $TMPDIR for tests_root as the host's /tmp might not have enough features or space. 
Signed-off-by: Michał Mirosław --- test/zdtm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index 33859f61eb..2a657e44dc 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -79,7 +79,8 @@ def clean_tests_root(): def make_tests_root(): global tests_root if not tests_root: - tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", "/tmp")) + tmpdir = os.environ.get("TMPDIR", "/tmp") + tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", tmpdir)) atexit.register(clean_tests_root) os.mkdir(os.path.join(tests_root[1], "root")) os.chmod(tests_root[1], 0o777) @@ -404,7 +405,7 @@ def __init__(self, name, desc, flavor, freezer, rootless): self.__flavor = flavor self.__freezer = freezer self._bins = [name] - self._env = {} + self._env = {'TMPDIR': os.environ.get('TMPDIR', '/tmp')} self._deps = desc.get('deps', []) self.auto_reap = True self.__timeout = int(self.__desc.get('timeout') or 30) @@ -828,7 +829,7 @@ def __init__(self, name, desc, flavor, freezer, rootless): self._bins += self.__subs self._deps += get_test_desc('zdtm/lib/groups')['deps'] - self._env = {'ZDTM_TESTS': self.__real_name} + self._env['ZDTM_TESTS'] = self.__real_name def __get_start_cmd(self, name): tdir = os.path.dirname(name) From 0bd5abe4ed331c96fcc6d4da5fc4a60ece0045e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 25 Apr 2023 21:30:20 +0200 Subject: [PATCH 037/321] zdtm: Add timeouts for test commands. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend ability to limit time taken to all CRIU invocations. 
Signed-off-by: Michał Mirosław --- test/zdtm.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index 2a657e44dc..b8a0c5a3bd 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -399,6 +399,7 @@ def __init__(self, name, desc, flavor, freezer, rootless): self.__name = name self.__desc = desc self.__freezer = None + self.__timeout = int(self.__desc.get('timeout') or 30) self.__rootless = rootless self.__make_action('cleanout') self.__pid = 0 @@ -408,7 +409,6 @@ def __init__(self, name, desc, flavor, freezer, rootless): self._env = {'TMPDIR': os.environ.get('TMPDIR', '/tmp')} self._deps = desc.get('deps', []) self.auto_reap = True - self.__timeout = int(self.__desc.get('timeout') or 30) def __make_action(self, act, env=None, root=None): sys.stdout.flush() # Not to let make's messages appear before ours @@ -430,7 +430,7 @@ def __make_action(self, act, env=None, root=None): preexec_fn=self.__freezer and self.__freezer.attach or None) if act == "pid": try_run_hook(self, ["--post-start"]) - if s.wait(): + if s.wait(timeout=self.__timeout): raise test_fail_exc(str(s_args)) if self.__freezer: @@ -839,7 +839,7 @@ def __get_start_cmd(self, name): subprocess.check_call(s_args + [tname + '.cleanout']) s = subprocess.Popen(s_args + ['--dry-run', tname + '.pid'], stdout=subprocess.PIPE) - out, _ = s.communicate() + out, _ = s.communicate(timeout=self.__timeout) cmd = out.decode().splitlines()[-1].strip() return 'cd /' + tdir + ' && ' + cmd @@ -883,7 +883,8 @@ def run(action, fault=None, strace=[], preexec=None, - nowait=False): + nowait=False, + timeout=60): env = dict( os.environ, ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0") @@ -899,7 +900,7 @@ def run(action, preexec_fn=preexec) if nowait: return cr - return cr.wait() + return cr.wait(timeout=timeout) class criu_rpc_process: @@ -982,7 +983,8 @@ def run(action, fault=None, strace=[], preexec=None, - nowait=False): + nowait=False, + timeout=None): if 
fault: raise test_fail_exc('RPC and FAULT not supported') if strace: From 05f2319f1e998eb8ea69f8de2d6e345ab682653b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 22 May 2023 20:20:18 +0200 Subject: [PATCH 038/321] zdtm: Allow --keep-going for single test. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't want the test framework to change its behaviour depending on whether we run a single test or multiple tests in a run. When we shard the test suite, some shards can end up with a single test to run, which would unexpectedly change the test output format. Signed-off-by: Michał Mirosław --- test/zdtm.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index b8a0c5a3bd..c278fafff6 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2344,11 +2344,6 @@ def run_tests(opts): return torun = list(torun) - if opts['keep_going'] and len(torun) < 2: - print( - "[WARNING] Option --keep-going is more useful when running multiple tests" - ) - opts['keep_going'] = False if opts['exclude']: excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") From e595787cf727504a942cb77f71bda0eaf93a4c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 29 May 2023 17:16:15 +0200 Subject: [PATCH 039/321] zdtm: Implement test sharding. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow splitting the test suite into predictable sets to parallelize runs on multiple machines or VMs.
Signed-off-by: Michał Mirosław --- test/zdtm.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index c278fafff6..1ef941b4e8 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2231,9 +2231,21 @@ def all_tests(opts): continue files.append(fp) excl = list(map(lambda x: os.path.join(desc['dir'], x), desc['exclude'])) - tlist = list(filter( + tlist = list(sorted(filter( lambda x: not x.endswith('.checkskip') and not x.endswith('.hook') and - x not in excl, map(lambda x: x.strip(), files))) + x not in excl, map(lambda x: x.strip(), files)))) + + if opts.get('test_shard_count'): + if opts.get('test_shard_index') is None: + raise KeyError('--test_shard_count > 0 must come with --test_shard_index') + slice_idx = opts['test_shard_index'] + slices = opts['test_shard_count'] + if slice_idx >= slices: + raise IndexError('--test_shard_index not less than --test_shard_count ({} >= {})'.format(slice_idx, slices)) + slist = list(tlist[slice_idx::slices]) + print("We're shard #{} of {}. Running {} of {} tests.\n".format(slice_idx, slices, len(slist), len(tlist))) + tlist = slist + return tlist @@ -2765,6 +2777,10 @@ def get_cli_args(): rp.add_argument("--mntns-compat-mode", help="Use old compat mounts restore engine", action='store_true') + rp.add_argument("--test-shard-index", type=int, default=None, + help="Select tests for a shard (0-based)") + rp.add_argument("--test-shard-count", type=int, default=0, + help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) From 9301aba877b82b4779ea8cde49999131c1bc7853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 18 May 2023 12:15:51 +0200 Subject: [PATCH 040/321] zdtm: sock_opts00: Improve error messages. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make it clear that the option numbers are indexes not the option identifiers ("names"). Also show the value change that prompted test failure. Signed-off-by: Michał Mirosław --- test/zdtm/static/sock_opts00.c | 51 +++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index 5b4624f6de..fcf00ffed8 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -12,22 +12,28 @@ const char *test_author = "Pavel Emelyanov "; #define TEST_PORT 59687 #define TEST_ADDR INADDR_ANY -#define NOPTS 8 - int main(int argc, char **argv) { - int sock, ret = 0, vname[NOPTS], val[NOPTS], rval, i; + #define OPT(x) { x, #x } + static const struct { + int opt; + const char *name; + } vname[] = { + OPT(SO_PRIORITY), + OPT(SO_RCVLOWAT), + OPT(SO_MARK), + OPT(SO_PASSCRED), + OPT(SO_PASSSEC), + OPT(SO_DONTROUTE), + OPT(SO_NO_CHECK), + OPT(SO_OOBINLINE), + }; + static const int NOPTS = sizeof(vname) / sizeof(*vname); + #undef OPT + + int sock, ret = 0, val[NOPTS], rval, i; socklen_t len = sizeof(int); - vname[0] = SO_PRIORITY; - vname[1] = SO_RCVLOWAT; - vname[2] = SO_MARK; - vname[3] = SO_PASSCRED; - vname[4] = SO_PASSSEC; - vname[5] = SO_DONTROUTE; - vname[6] = SO_NO_CHECK; - vname[7] = SO_OOBINLINE; - test_init(argc, argv); sock = socket(PF_INET, SOCK_STREAM, 0); @@ -37,29 +43,29 @@ int main(int argc, char **argv) } for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &val[i], &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { - pr_perror("can't get option %d", i); + pr_perror("can't get %s", vname[i].name); return 1; } val[i]++; - ret = setsockopt(sock, SOL_SOCKET, vname[i], &val[i], len); + ret = setsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { - pr_perror("can't set option %d", i); + 
pr_perror("can't set %s = %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d 2", i); + pr_perror("can't re-get %s", vname[i].name); return 1; } if (rval != val[i]) { if (rval + 1 == val[i]) { - pr_perror("can't reset option %d want %d have %d", i, val[i], rval); + pr_perror("failed to set %s: want %d have %d", vname[i].name, val[i], rval); return 1; } @@ -72,14 +78,15 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d again", i); + pr_perror("can't verify %s", vname[i].name); return 1; } if (val[i] != rval) { - fail("option %d changed", i); + errno = 0; + fail("%s changed: %d -> %d", vname[i].name, val[i], rval); return 1; } } From a2c4dd2265701285c6d74dc99d9f7f0ad29f122a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 12 Sep 2022 16:17:43 +0200 Subject: [PATCH 041/321] Allow skipping iptables/nftables invocation. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make it possible to skip network lock to enable uses that break connections anyway to work without iptables/nftables being present. 
Signed-off-by: Michał Mirosław --- Documentation/criu.txt | 3 +++ criu/config.c | 2 ++ criu/cr-service.c | 3 +++ criu/include/cr_options.h | 1 + criu/net.c | 6 ++++++ criu/sk-tcp.c | 6 ++++++ images/rpc.proto | 1 + lib/c/criu.c | 2 +- lib/c/criu.h | 1 + 9 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 0e7d19c4cd..0c4cf8b615 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -457,6 +457,9 @@ The 'mode' may be one of the following: *nftables*::: Use nftables rules to drop the packets. + *skip*::: Don't lock the network. If *--tcp-close* is not used, the network + must be locked externally to allow CRIU to dump TCP connections. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. diff --git a/criu/config.c b/criu/config.c index 9f02ae9928..1322a490ab 100644 --- a/criu/config.c +++ b/criu/config.c @@ -1036,6 +1036,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, opts.network_lock_method = NETWORK_LOCK_IPTABLES; } else if (!strcmp("nftables", optarg)) { opts.network_lock_method = NETWORK_LOCK_NFTABLES; + } else if (!strcmp("skip", optarg) || !strcmp("none", optarg)) { + opts.network_lock_method = NETWORK_LOCK_SKIP; } else { pr_err("Invalid value for --network-lock: %s\n", optarg); return 1; diff --git a/criu/cr-service.c b/criu/cr-service.c index 915ba38709..fa74903704 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -526,6 +526,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) case CRIU_NETWORK_LOCK_METHOD__NFTABLES: opts.network_lock_method = NETWORK_LOCK_NFTABLES; break; + case CRIU_NETWORK_LOCK_METHOD__SKIP: + opts.network_lock_method = NETWORK_LOCK_SKIP; + break; default: goto err; } diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index c7e98c756c..60cf9437e6 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -67,6 +67,7 @@ struct cg_root_opt { enum NETWORK_LOCK_METHOD { 
NETWORK_LOCK_IPTABLES, NETWORK_LOCK_NFTABLES, + NETWORK_LOCK_SKIP, }; #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES diff --git a/criu/net.c b/criu/net.c index 84250598c0..4abfc182a8 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3131,6 +3131,9 @@ int network_lock_internal(void) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; @@ -3193,6 +3196,9 @@ static int network_unlock_internal(void) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 96d5d13bf6..630a182a27 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -39,6 +39,8 @@ static int lock_connection(struct inet_sk_desc *sk) return iptables_lock_connection(sk); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) return nftables_lock_connection(sk); + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -50,6 +52,8 @@ static int unlock_connection(struct inet_sk_desc *sk) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -483,6 +487,8 @@ static int unlock_connection_info(struct inet_sk_info *si) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } diff --git a/images/rpc.proto b/images/rpc.proto index 79623f9f6c..8748bdaff7 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -52,6 +52,7 @@ enum criu_cg_mode { enum criu_network_lock_method { IPTABLES = 1; NFTABLES = 2; + SKIP = 3; }; enum criu_pre_dump_mode { diff --git a/lib/c/criu.c b/lib/c/criu.c index 0095bcc9bc..7f766db857 100644 
--- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -1868,7 +1868,7 @@ void criu_set_pidfd_store_sk(int sk) int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method) { opts->rpc->has_network_lock = true; - if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES) { + if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES || method == CRIU_NETWORK_LOCK_SKIP) { opts->rpc->network_lock = (CriuNetworkLockMethod)method; return 0; } diff --git a/lib/c/criu.h b/lib/c/criu.h index 3b9cedfd09..c1c6078698 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -50,6 +50,7 @@ enum criu_cg_mode { enum criu_network_lock_method { CRIU_NETWORK_LOCK_IPTABLES = 1, CRIU_NETWORK_LOCK_NFTABLES = 2, + CRIU_NETWORK_LOCK_SKIP = 3, }; enum criu_pre_dump_mode { CRIU_PRE_DUMP_SPLICE = 1, CRIU_PRE_DUMP_READ = 2 }; From f018893d261d649edeca57de5763b65b159800be Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 18 Jun 2023 09:53:48 +0200 Subject: [PATCH 042/321] test/thp_disable: fix lint The fail() macro provides a new line character at the end of the message. 
This patch fixes the following lint check that currently fails in CI: $ git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' test/zdtm/static/thp_disable.c: fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); test/zdtm/static/thp_disable.c: fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); test/zdtm/static/thp_disable.c: fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); test/zdtm/static/thp_disable.c: fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); test/zdtm/static/thp_disable.c: fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); test/zdtm/static/thp_disable.c: fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); Fixes: #2193 Signed-off-by: Radostin Stoyanov --- criu/kerndat.c | 2 +- test/zdtm/static/thp_disable.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index d38e8898ef..4565e53077 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1325,7 +1325,7 @@ int kerndat_has_thp_disable(void) parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); if (!kdat.has_thp_disable) - pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE"); + pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE\n"); break; } } diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index eabb45650d..55609f2605 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -43,7 +43,7 @@ int main(int argc, char **argv) } if (ret != 1) { errno = 0; - fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); + fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); return -1; } @@ -53,12 +53,12 @@ int main(int argc, char **argv) errno = 0; if (orig_flags != new_flags) { - fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); + fail("Flags changed %lx -> %lx", orig_flags, new_flags); return -1; } if (orig_madv != 
new_madv) { - fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); return -1; } @@ -72,7 +72,7 @@ int main(int argc, char **argv) } if (ret != 1) { errno = 0; - fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); + fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); return -1; } @@ -87,12 +87,12 @@ int main(int argc, char **argv) errno = 0; if (orig_flags != new_flags) { - fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); + fail("Flags changed %lx -> %lx", orig_flags, new_flags); return -1; } if (orig_madv != new_madv) { - fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); return -1; } From e6427c5600867cd6a5d1d0f03bee1c25f6771b88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 22:40:10 +0200 Subject: [PATCH 043/321] sockets: Increase the size of sockets hashmap to 16K. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During dump, CRIU stores the structs representing sockets in a statically sized hashmap of size 32. We have some (admittedly crazy) tasks that use tens of thousands of sockets, and seem to spend most of the dump time iterating over the linked lists of the map. 16K is chosen arbitrarily, so that it reduces the lengths of the chains to few elements on average, while not introducing significant memory overhead. 
From: Radosław Burny Signed-off-by: Michał Mirosław --- criu/sockets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/sockets.c b/criu/sockets.c index d17e0a9869..560c765175 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -38,7 +38,7 @@ #define SOCK_DIAG_BY_FAMILY 20 #endif -#define SK_HASH_SIZE 32 +#define SK_HASH_SIZE (1 << 14) #ifndef SO_GET_FILTER #define SO_GET_FILTER SO_ATTACH_FILTER From 12290f4583d8810b6692eb501cf8336a590ffd53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 22:38:31 +0200 Subject: [PATCH 044/321] pipes: Plug pipe fd leak in "Unable to set a pipe size" error case. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Piotr Figiel Signed-off-by: Michał Mirosław --- criu/pipes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/pipes.c b/criu/pipes.c index 43ff06e3d8..daada88306 100644 --- a/criu/pipes.c +++ b/criu/pipes.c @@ -434,7 +434,7 @@ int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms /* steal_pipe has to be able to fit all data from a target pipe */ if (fcntl(steal_pipe[1], F_SETPIPE_SZ, pipe_size) < 0) { pr_perror("Unable to set a pipe size"); - goto err; + goto err_close; } bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK); From f5cd44f79737fb91cacd28bd4b6d0abdca1e180b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 18:54:28 +0200 Subject: [PATCH 045/321] kerndat: Make socket feature probing work on IPv6-only host. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Try IPv6 if IPv4 sockets are not supported. 
Signed-off-by: Michał Mirosław --- criu/cr-check.c | 2 ++ criu/kerndat.c | 51 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index a4166f76ba..cb083b16ca 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1086,6 +1086,8 @@ static int kerndat_tcp_repair_window(void) int sk, val = 1; sk = socket(AF_INET, SOCK_STREAM, 0); + if (sk < 0 && errno == EAFNOSUPPORT) + sk = socket(AF_INET6, SOCK_STREAM, 0); if (sk < 0) { pr_perror("Unable to create inet socket"); goto errn; diff --git a/criu/kerndat.c b/criu/kerndat.c index 4565e53077..b2e39cb405 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -12,7 +12,7 @@ #include #include #include -#include /* for sockaddr_in and inet_ntoa() */ +#include #include #include #include @@ -615,29 +615,52 @@ static int kerndat_iptables_has_xtlocks(void) return 0; } -int kerndat_tcp_repair(void) +/* + * Unfortunately in C htonl() is not constexpr and cannot be used in a static + * initialization below. + */ +#define constant_htonl(x) \ + (__BYTE_ORDER == __BIG_ENDIAN ? 
(x) : \ + (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) + +static int kerndat_tcp_repair(void) { + static const struct sockaddr_in loopback_ip4 = { + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { constant_htonl(INADDR_LOOPBACK) }, + }; + static const struct sockaddr_in6 loopback_ip6 = { + .sin6_family = AF_INET6, + .sin6_port = 0, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; int sock, clnt = -1, yes = 1, exit_code = -1; - struct sockaddr_in addr; - socklen_t aux; + const struct sockaddr *addr; + struct sockaddr_storage listener_addr; + socklen_t addrlen; - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - inet_pton(AF_INET, "127.0.0.1", &(addr.sin_addr)); - addr.sin_port = 0; + addr = (const struct sockaddr *)&loopback_ip4; + addrlen = sizeof(loopback_ip4); sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) { + addr = (const struct sockaddr *)&loopback_ip6; + addrlen = sizeof(loopback_ip6); + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); + } if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } - if (bind(sock, (struct sockaddr *)&addr, sizeof(addr))) { + if (bind(sock, addr, addrlen)) { pr_perror("Unable to bind a socket"); goto err; } - aux = sizeof(addr); - if (getsockname(sock, (struct sockaddr *)&addr, &aux)) { + addrlen = sizeof(listener_addr); + if (getsockname(sock, (struct sockaddr *)&listener_addr, &addrlen)) { pr_perror("Unable to get a socket name"); goto err; } @@ -647,13 +670,13 @@ int kerndat_tcp_repair(void) goto err; } - clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + clnt = socket(addr->sa_family, SOCK_STREAM, IPPROTO_TCP); if (clnt < 0) { pr_perror("Unable to create a socket"); goto err; } - if (connect(clnt, (struct sockaddr *)&addr, sizeof(addr))) { + if (connect(clnt, (const struct sockaddr *)&listener_addr, addrlen)) { pr_perror("Unable to connect a socket"); goto err; } @@ -977,6 +1000,8 
@@ int kerndat_sockopt_buf_lock(void) int sock; sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { pr_perror("Unable to create a socket"); return -1; From 1e90fc8f4b4b2c8457470e01697135914b5e2296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 18:23:16 +0200 Subject: [PATCH 046/321] restore: remove unused `secbits` field. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/include/restorer.h | 1 - 1 file changed, 1 deletion(-) diff --git a/criu/include/restorer.h b/criu/include/restorer.h index e232f54040..2475ee0bcb 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -72,7 +72,6 @@ struct thread_creds_args { u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; - unsigned int secbits; char *lsm_profile; unsigned int *groups; char *lsm_sockcreate; From aa6b633912565f99e242df7839c6e1ece7c03673 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 20 Jun 2022 20:36:31 +0200 Subject: [PATCH 047/321] build: Remove HAS_MEMFD test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test for HAS_MEMFD is empty and not used. Remove it. 
Fixes: 5ee1ac1f28e6 ("criu: remove FEATURE_TEST_MEMFD") Change-Id: I43b8f0cfd50ce9bdf93dafb647377318df1deae8 Signed-off-by: Michał Mirosław --- Makefile.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.config b/Makefile.config index 270ec61c0f..a13165aa7e 100644 --- a/Makefile.config +++ b/Makefile.config @@ -78,7 +78,7 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW MEMFD_CREATE \ + SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name From 2d76e4b31a91424d9d8c6533d2c0aaa38bf8f639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 20 Jun 2022 20:36:30 +0200 Subject: [PATCH 048/321] build: Debug system feature tests. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `make` without `-s` option will normally show the commands executed. In the case of detecting build environment features current makefile will cause detected features to be seen as 'echo #define' commands, but not detected ones will be silent. Change it so that all tried features can be seen (outside of make's silent mode) regardless of detection result. 
Signed-off-by: Michał Mirosław --- Makefile.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.config b/Makefile.config index a13165aa7e..8f2b5208e0 100644 --- a/Makefile.config +++ b/Makefile.config @@ -85,7 +85,8 @@ FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ define gen-feature-test ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) $(Q) echo '#define CONFIG_HAS_$(1)' >> $$@ - $(Q) echo '' >> $$@ +else + $(Q) echo '// CONFIG_HAS_$(1) is not set' >> $$@ endif endef From 722a90ccd9b465c3af62e66a02ffa36533f5d013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 20 Jun 2022 21:00:20 +0200 Subject: [PATCH 049/321] build: Fix LIBS vs LDFLAGS order. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit $LDFLAGS can contain `-Ldir`s that are required by '-lib's in $LIBS. Reverse the order so that `-L` options make effect. Signed-off-by: Michał Mirosław --- criu/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/Makefile b/criu/Makefile index 55bdb1b7a3..c6050d5826 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -85,7 +85,7 @@ $(obj)/%: pie $(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) $(GMONLDOPT) -rdynamic -o $@ UNIT-BUILTINS += $(obj)/util.o UNIT-BUILTINS += $(obj)/config.o @@ -102,7 +102,7 @@ $(obj)/unittest/built-in.o: .FORCE $(obj)/unittest/unittest: $(UNIT-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) -rdynamic -o $@ unittest: $(obj)/unittest/unittest $(Q) $(obj)/unittest/$@ From 85f53bdecdc79d604d9c9fceae5dce8b3f96933f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 21 Apr 2023 
15:51:41 +0200 Subject: [PATCH 050/321] build: Use make-provided AR for building libzdtmtst. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make $(AR) used also for libzdtmtst build. Signed-off-by: Michał Mirosław --- test/zdtm/lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 90bd28f9e1..b574e1d3e7 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -34,4 +34,4 @@ clean: clean-more $(LIB): $(LIBOBJ) $(E) " AR " $@ - $(Q)ar rcs $@ $^ + $(Q)$(AR) rcs $@ $^ From eece28d29c910374a5ead93cccdc1ae105587e7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 20 Jun 2022 20:36:28 +0200 Subject: [PATCH 051/321] build: Guard against libbsd's version of `__aligned`. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When trying to build CRIU with libbsd enabled the compilation fails due to duplicate definition of __aligned macro. Other such definitions are already wrapped with #ifndef; make the __aligned definition consistent and make it easier in the future to use the libbsd features if needed. Signed-off-by: Michał Mirosław --- include/common/compiler.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/common/compiler.h b/include/common/compiler.h index bd3de01df1..1c9d3db8d6 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -47,7 +47,9 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __aligned #define __aligned(x) __attribute__((aligned(x))) +#endif /* * Macro to define stack alignment. From 0588c3b21a17f063437cff22c15ca12801ed711b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 18:34:58 +0200 Subject: [PATCH 052/321] build: libnfnetlink: Remove nla_get_s32(). 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nla_get_s32() was added to libnl 3.2.7 in 2015. Remove CRIU's definition as it breaks build when statically linking the binary. From: Uros Prestor Signed-off-by: Michał Mirosław --- criu/libnetlink.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/criu/libnetlink.c b/criu/libnetlink.c index f0304b0dbc..c7a84a44d3 100644 --- a/criu/libnetlink.c +++ b/criu/libnetlink.c @@ -214,8 +214,3 @@ int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], in return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy); } - -int32_t nla_get_s32(const struct nlattr *nla) -{ - return *(const int32_t *)nla_data(nla); -} From b02d53a8f41c471f88c8487c58b8a5bb2a9a55a3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 24 Jun 2023 11:39:21 +0100 Subject: [PATCH 053/321] action-scripts: allow shell scripts in rpc mode Container runtimes commonly use CRIU with RPC. However, this prevents the use of action-scripts set in a CRIU configuration file due to the explicit scripts mode introduced with the following commit: ac78f13bdfaee260dd4234f054bf4c5d2a373783 actions: Introduce explicit scripts mode This patch enables container checkpoint/restore with action-scripts specified via configuration file. 
Signed-off-by: Radostin Stoyanov --- criu/action-scripts.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/criu/action-scripts.c b/criu/action-scripts.c index ec0563e162..1c9a8f0914 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -52,6 +52,9 @@ static int run_shell_scripts(const char *action) #define ENV_IMGDIR 0x1 #define ENV_ROOTPID 0x2 + if (list_empty(&scripts)) + return 0; + if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); return -1; @@ -119,23 +122,24 @@ int run_scripts(enum script_actions act) pr_debug("Running %s scripts\n", action); - if (scripts_mode == SCRIPTS_NONE) + switch (scripts_mode) { + case SCRIPTS_NONE: return 0; - - if (scripts_mode == SCRIPTS_RPC) { + case SCRIPTS_RPC: ret = rpc_send_fd(act, -1); - goto out; - } - - if (scripts_mode == SCRIPTS_SHELL) { + if (ret) + break; + /* Enable scripts from config file in RPC mode (fallthrough) */ + case SCRIPTS_SHELL: ret = run_shell_scripts(action); - goto out; + break; + default: + BUG(); } - BUG(); -out: if (ret) pr_err("One of more action scripts failed\n"); + return ret; } @@ -143,8 +147,9 @@ int add_script(char *path) { struct script *script; - BUG_ON(scripts_mode == SCRIPTS_RPC); - scripts_mode = SCRIPTS_SHELL; + /* Set shell mode when a script is added but don't overwrite RPC mode */ + if (scripts_mode == SCRIPTS_NONE) + scripts_mode = SCRIPTS_SHELL; script = xmalloc(sizeof(struct script)); if (script == NULL) @@ -170,7 +175,6 @@ int add_rpc_notify(int sk) return -1; } - BUG_ON(scripts_mode == SCRIPTS_SHELL); scripts_mode = SCRIPTS_RPC; if (install_service_fd(RPC_SK_OFF, fd) < 0) From cde9bcf63d5b397ccc08e8966f510828eed0c0df Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 24 Jun 2023 11:58:47 +0100 Subject: [PATCH 054/321] docker/podman: test c/r with action-script Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 3 +++ 
scripts/ci/podman-test.sh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index beb7da6da6..bd46d5dd31 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -28,6 +28,9 @@ CRIU_LOG='/criu.log' mkdir -p /etc/criu echo "log-file=$CRIU_LOG" > /etc/criu/runc.conf +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf + export SKIP_CI_TEST=1 ./run-ci-tests.sh diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 687acb8ff5..72ad59a501 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -17,6 +17,9 @@ mkdir -p /etc/criu echo "manage-cgroups ignore" > /etc/criu/runc.conf sed -i 's/#runtime\s*=\s*.*/runtime = "runc"/' /usr/share/containers/containers.conf +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf + podman info podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' From 8ba6efb0056a57a6b3f3abd2173f91d765e13a20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 31 Mar 2022 06:59:34 -0700 Subject: [PATCH 055/321] rpc: Support gathering external file list after freezing process tree. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New 'query-ext-files' action for `criu dump` is sent after freezing the process tree. This allows to defer gathering the external file list when the process tree is in a stable state and avoids race with the process creating and deleting files. 
Change-Id: Iae32149dc3992dea086f513ada52cf6863beaa1f Signed-off-by: Michał Mirosław --- Documentation/criu.txt | 5 ++++ criu/action-scripts.c | 15 ++++++++++++ criu/cr-dump.c | 3 +++ criu/cr-service.c | 43 +++++++++++++++++++++++++++++++++++ criu/include/action-scripts.h | 3 +++ 5 files changed, 69 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 0c4cf8b615..606935790b 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -155,6 +155,11 @@ not compatible with *--external* *dev*. notification message contains a file descriptor for the master pty + *query-ext-files*::: + called after the process tree is stopped and network is locked. + This hook is used only in the RPC mode. The notification reply + contains file ids to be added to external file list (may be empty). + *--unprivileged*:: This option tells *criu* to accept the limitations when running as non-root. Running as non-root requires *criu* at least to have diff --git a/criu/action-scripts.c b/criu/action-scripts.c index 1c9a8f0914..6f79001864 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -31,6 +31,7 @@ static const char *action_names[ACT_MAX] = { [ACT_POST_RESUME] = "post-resume", [ACT_ORPHAN_PTS_MASTER] = "orphan-pts-master", [ACT_STATUS_READY] = "status-ready", + [ACT_QUERY_EXT_FILES] = "query-ext-files", }; struct script { @@ -115,6 +116,20 @@ int rpc_send_fd(enum script_actions act, int fd) return send_criu_rpc_script(act, (char *)action, rpc_sk, fd); } +int rpc_query_external_files(void) +{ + int rpc_sk; + + if (scripts_mode != SCRIPTS_RPC) + return 0; + + rpc_sk = get_service_fd(RPC_SK_OFF); + if (rpc_sk < 0) + return -1; + + return exec_rpc_query_external_files((char *)action_names[ACT_QUERY_EXT_FILES], rpc_sk); +} + int run_scripts(enum script_actions act) { int ret = 0; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 90d763f497..340fb96ecd 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2180,6 +2180,9 @@ int cr_dump_tasks(pid_t 
pid) if (network_lock()) goto err; + if (rpc_query_external_files()) + goto err; + if (collect_file_locks()) goto err; diff --git a/criu/cr-service.c b/criu/cr-service.c index fa74903704..f62245d5fa 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -240,6 +240,49 @@ int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) return 0; } +int exec_rpc_query_external_files(char *name, int sk) +{ + int i, ret; + CriuNotify cn = CRIU_NOTIFY__INIT; + CriuResp msg = CRIU_RESP__INIT; + CriuReq *req; + + cn.script = name; + + msg.type = CRIU_REQ_TYPE__NOTIFY; + msg.success = true; + msg.notify = &cn; + + ret = send_criu_msg_with_fd(sk, &msg, -1); + if (ret < 0) + return ret; + + ret = recv_criu_msg(sk, &req); + if (ret < 0) + return ret; + + if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { + pr_err("RPC client reported script error\n"); + return -1; + } + + ret = 0; + if (req->opts) + for (i = 0; i < req->opts->n_external; i++) { + char *key = req->opts->external[i]; + pr_info("Adding external object: %s\n", key); + if (add_external(key)) { + pr_err("Failed to add external object: %s\n", key); + ret = -1; + } + } + else + pr_info("RPC NOTIFY %s: no `opts` returned.\n", name); + + criu_req__free_unpacked(req, NULL); + return ret; +} + static char images_dir[PATH_MAX]; static int setup_opts_from_req(int sk, CriuOpts *req) diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h index 793698c27c..6a331a32f8 100644 --- a/criu/include/action-scripts.h +++ b/criu/include/action-scripts.h @@ -17,6 +17,7 @@ enum script_actions { ACT_PRE_RESUME, ACT_ORPHAN_PTS_MASTER, ACT_STATUS_READY, + ACT_QUERY_EXT_FILES, ACT_MAX }; @@ -25,6 +26,8 @@ extern int add_script(char *path); extern int add_rpc_notify(int sk); extern int run_scripts(enum script_actions); extern int rpc_send_fd(enum script_actions, int fd); +extern int rpc_query_external_files(void); +extern int exec_rpc_query_external_files(char *name, int sk); extern int 
send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); #endif /* __CR_ACTION_SCRIPTS_H__ */ From 6cfe7aa1142093f93a38a127a8a063e1470bca65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:38:33 +0200 Subject: [PATCH 056/321] rpc: Support setting images_dir by path. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Google's RPC client process is in a different pidns and has more privileges -- CRIU can't open its /proc/<pid>/fd/<fd>. For images_dir_fd to be useful here it would need to refer to a passed or CRIU's fd. From: Michał Cłapiński Change-Id: Icbfb5af6844b21939a15f6fbb5b02264c12341b1 Signed-off-by: Michał Mirosław --- criu/cr-service.c | 8 +++++++- images/rpc.proto | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index f62245d5fa..61a04c5ffe 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -382,8 +382,14 @@ static int setup_opts_from_req(int sk, CriuOpts *req) */ if (imgs_changed_by_rpc_conf) strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else + else if (req->images_dir_fd != -1) sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + else if (req->images_dir) + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + goto err; + } if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); diff --git a/images/rpc.proto b/images/rpc.proto index 8748bdaff7..1a4722a9ce 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -61,7 +61,8 @@ enum criu_pre_dump_mode { }; message criu_opts { - required int32 images_dir_fd = 1; + required int32 images_dir_fd = 1 [default = -1]; + optional string images_dir = 68; /* used only if images_dir_fd == -1 */ optional int32 pid = 2; /* if not set on dump, will dump requesting process */ optional bool leave_running = 3; From 
fb8ca647f220435c0542d4ebe47ef5c6195518c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:31:57 +0200 Subject: [PATCH 057/321] util: Downgrade ignored errors to warnings. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the error is ignored it is not important enough - make it a warning instead. From: Mian Luo Change-Id: If2641c3d4e0a4d57fdf04e4570c49be55f526535 Signed-off-by: Michał Mirosław --- Makefile | 12 ++++++------ criu/include/log.h | 2 ++ criu/util.c | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 23f68e2f31..c462a9ff0f 100644 --- a/Makefile +++ b/Makefile @@ -450,12 +450,12 @@ lint: shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh codespell -S tags - # Do not append \n to pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' - # Do not use %m with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|debug|info|msg)|fail)\>.*%m' - # Do not use errno with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>\(".*".*errno' + # Do not append \n to pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' + # Do not use %m with pr_* or fail + ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|pwarn|debug|info|msg)|fail)\>.*%m' + # Do not use errno with pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>\(".*".*errno' # End pr_(err|warn|msg|info|debug) with \n ! git --no-pager grep -En '^\s*\.*);$$' | grep -v '\\n' # No EOL whitespace for C files diff --git a/criu/include/log.h b/criu/include/log.h index 85e6dc2e72..cbed330076 100644 --- a/criu/include/log.h +++ b/criu/include/log.h @@ -60,6 +60,8 @@ void flush_early_log_buffer(int fd); #define pr_perror(fmt, ...) 
pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) +#define pr_pwarn(fmt, ...) pr_warn(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) + #endif /* CR_NOGLIBC */ #endif /* __CR_LOG_H__ */ diff --git a/criu/util.c b/criu/util.c index db96cf938f..a4975b92f4 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1076,14 +1076,14 @@ void tcp_cork(int sk, bool on) { int val = on ? 1 : 0; if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) - pr_perror("Unable to restore TCP_CORK (%d)", val); + pr_pwarn("Unable to restore TCP_CORK (%d)", val); } void tcp_nodelay(int sk, bool on) { int val = on ? 1 : 0; if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) - pr_perror("Unable to restore TCP_NODELAY (%d)", val); + pr_pwarn("Unable to restore TCP_NODELAY (%d)", val); } static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, unsigned short port) From f0d1b89f567a620192a754a3dfc4301844c7319a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 21:39:05 +0200 Subject: [PATCH 058/321] kerndat: unexport kerndat_nsid() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kerndat_nsid() is not used outside kerndat.c. Make it static. 
Change-Id: I52e518ecb7c627cc1866e373411b2be3f71a2c9d Signed-off-by: Michał Mirosław --- criu/include/net.h | 1 - criu/kerndat.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/include/net.h b/criu/include/net.h index 0da4cad137..5e8a848620 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -50,7 +50,6 @@ extern int kerndat_has_newifindex(void); extern int kerndat_link_nsid(void); extern int net_get_nsid(int rtsk, int fd, int *nsid); extern struct ns_id *net_get_root_ns(void); -extern int kerndat_nsid(void); extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); extern struct ns_id *get_root_netns(void); diff --git a/criu/kerndat.c b/criu/kerndat.c index b2e39cb405..fbc5b99d00 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -703,7 +703,7 @@ static int kerndat_tcp_repair(void) return exit_code; } -int kerndat_nsid(void) +static int kerndat_nsid(void) { int nsid, sk; From a5939b006c57a81c8bb4d6043668a227461104f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 21:33:16 +0200 Subject: [PATCH 059/321] kerndat: Don't fail on NETLINK/nsid support missing. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If not dumping netns nor connections, nsid support is not used. Don't fail the run as if the support is needed, the dumping process will fail later. 
Change-Id: I39a086756f6d520c73bb6b21eaf6d9fb49a18879 Signed-off-by: Michał Mirosław --- criu/kerndat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index fbc5b99d00..597fe5d925 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -707,16 +707,18 @@ static int kerndat_nsid(void) { int nsid, sk; + kdat.has_nsid = false; + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sk < 0) { - pr_perror("Unable to create a netlink socket"); - return -1; + pr_pwarn("Unable to create a netlink socket: NSID can't be used."); + return 0; } if (net_get_nsid(sk, getpid(), &nsid) < 0) { - pr_err("NSID is not supported\n"); + pr_warn("NSID is not supported\n"); close(sk); - return -1; + return 0; } kdat.has_nsid = true; From ba11426de59ffb02a8cbd00b2ca03740b6c10ef0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 9 Dec 2022 15:24:32 +0100 Subject: [PATCH 060/321] util: Make CRIU run_id machine-level unique. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of relying on chance of CLOCK_MONOTONIC reading being unique, use pid namespace ID that combined with the process ID will make it unique on the machine level. If pidns is not enabled on a kernel we'll get ENOENT, but then CRIU's pid will already be unique. If there is some other error, log it but continue, as the socket clash (if it happens) will result in a failed run anyway. 
Fixes: 45e048d77a6a (2022-03-31 "criu: generate unique socket names") Fixes: 408a7d82d644 (2022-02-12 "util: add an unique ID of the current criu run") Change-Id: I111c006e1b5b1db8932232684c976a84f4256e49 Signed-off-by: Michał Mirosław --- criu/util.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/util.c b/criu/util.c index a4975b92f4..744ec60325 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1880,11 +1880,16 @@ uint64_t criu_run_id; void util_init(void) { - struct timespec tp; + struct stat statbuf; + + criu_run_id = getpid(); + if (!stat("/proc/self/ns/pid", &statbuf)) + criu_run_id |= (uint64_t)statbuf.st_ino << 32; + else if (errno != ENOENT) + pr_perror("Can't stat /proc/self/ns/pid - CRIU run id might not be unique"); - clock_gettime(CLOCK_MONOTONIC, &tp); - criu_run_id = ((uint64_t)getpid() << 32) + tp.tv_sec + tp.tv_nsec; compel_run_id = criu_run_id; + pr_info("CRIU run id = %#" PRIx64 "\n", criu_run_id); } /* From 4c2b71c37259e52b07b992f5b953e9ca59ae20df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 19 Jun 2023 11:29:22 +0200 Subject: [PATCH 061/321] zdtm: Update netns purpose comment in zdtm_ct. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the parasite socket clash now guaranteed not to happen, the comment becomes obsolete. netns is still needed though, so update the comment to point at the requirement. 
Change-Id: I3cfb253cd5c53b91b955fcb001530b4aee5129f4 Signed-off-by: Michał Mirosław --- test/zdtm_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 5e849b904b..44316893da 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -102,7 +102,7 @@ int main(int argc, char **argv) /* * pidns is used to avoid conflicts * mntns is used to mount /proc - * net is used to avoid conflicts of parasite sockets + * net is used to avoid conflicts between network tests */ if (!uid) if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) From 3a932e9115fcc2cd3f90b25b89910e6577b4b88b Mon Sep 17 00:00:00 2001 From: Abhishek Guleri Date: Thu, 6 Jul 2023 16:19:45 +0530 Subject: [PATCH 062/321] readme: refactor asciinema link for video playback Instead of opening the image directly, the commit refactors the asciinema image embedded link to redirect users to the corresponding video. Signed-off-by: Abhishek Guleri --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ff4aa1a239..11d1c490b6 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Pages worth starting with are: - Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) ### Checkpoint and restore of simple loop process -[

](https://asciinema.org/a/232445) +

## Advanced features From 42a5b640f658771d2ddf238045bf4fa54569a64a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 20 Jun 2023 07:51:26 +0100 Subject: [PATCH 063/321] ci: disable CentOS 7 test in Cirrus CI Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 32 ----------------------- scripts/build/Dockerfile.centos7 | 45 -------------------------------- 2 files changed, 77 deletions(-) delete mode 100644 scripts/build/Dockerfile.centos7 diff --git a/.cirrus.yml b/.cirrus.yml index 80f3296fce..8b8212d695 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -120,38 +120,6 @@ task: build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" -task: - name: CentOS 7 based test - environment: - HOME: "/root" - CIRRUS_WORKING_DIR: "/tmp/criu" - - compute_engine_instance: - image_project: centos-cloud - image: family/centos-7 - platform: linux - cpu: 4 - memory: 8G - - setup_script: | - # EPEL is needed for python2-future, python2-junit_xml, python-pathlib, python-flake8 and libbsd-devel. 
- # Do not fail if latest epel repository definition is already installed - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm || : - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six python-pathlib sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel - # Even with selinux in permissive mode the selinux tests will be executed - # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode - setenforce 0 - # Enable user namespaces on CentOS 7 - echo 10000 > /proc/sys/user/max_user_namespaces - # Adapt sudoers to our needs - echo 'root ALL=(ALL:ALL) ALL' | EDITOR='tee -a' visudo - - build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_IGNORE_TAINT=1 ZDTM_OPTS="-x zdtm/static/socket-raw -x zdtm/static/child_subreaper_existing_child -x zdtm/static/fifo_upon_unix_socket01 -x zdtm/static/overmount_sock -x zdtm/static/tempfs_overmounted" - task: name: aarch64 build GCC (native) arm_container: diff --git a/scripts/build/Dockerfile.centos7 b/scripts/build/Dockerfile.centos7 deleted file mode 100644 index 21e70ff0eb..0000000000 --- a/scripts/build/Dockerfile.centos7 +++ /dev/null @@ -1,45 +0,0 @@ -FROM centos:7 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm -RUN yum install -y \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - 
libnl3-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - protobuf-python \ - python \ - python-flake8 \ - python-ipaddress \ - python2-future \ - python2-junit_xml \ - python-yaml \ - python-six \ - sudo \ - tar \ - which \ - e2fsprogs \ - python2-pip \ - rubygem-asciidoctor - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN make -C test/zdtm -j $(nproc) From f8466ca798acd124eebbba2655894ebd2f777879 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 20 Jun 2023 07:58:33 +0100 Subject: [PATCH 064/321] ci: clean up CentOS 7 related tweaks We have disabled CentOS 7 tests in CI. This patch reverts the changes introduced in the following commit: 24bc083653f7d2b984653194e921b1ff32292b3b ci: disable some tests on CentOS 7 Signed-off-by: Radostin Stoyanov --- test/others/ns_ext/run.sh | 2 -- test/others/ns_ext/run_pidns.sh | 3 --- test/others/rpc/run.sh | 10 +--------- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/test/others/ns_ext/run.sh b/test/others/ns_ext/run.sh index e416f95e53..4ebe3e2801 100755 --- a/test/others/ns_ext/run.sh +++ b/test/others/ns_ext/run.sh @@ -4,8 +4,6 @@ set -x if [[ "$1" == "pid" ]]; then NS=pid - # CentOS 7 kernels do not have NSpid -> skip this test - grep NSpid /proc/self/status || exit 0 else NS=net fi diff --git a/test/others/ns_ext/run_pidns.sh b/test/others/ns_ext/run_pidns.sh index 08c5bff8e8..db12106e03 100755 --- a/test/others/ns_ext/run_pidns.sh +++ b/test/others/ns_ext/run_pidns.sh @@ -2,9 +2,6 @@ set -e -# CentOS 7 kernels do not have NSpid -> skip this test -grep NSpid /proc/self/status || exit 0 - # This test creates a process in non-host pidns and then dumps it and restores # it into host pidns. 
We use pid >100000 in non-host pidns to make sure it does # not intersect with some host pid on restore but it is potentially racy so diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index 9be5775872..afd4fb5e33 100755 --- a/test/others/rpc/run.sh +++ b/test/others/rpc/run.sh @@ -1,14 +1,6 @@ #!/bin/bash -set -ex - -if [ -e /etc/os-release ]; then - . /etc/os-release - if [ "$ID" == "centos" ] && [[ "$VERSION_ID" == "7"* ]];then - echo "Skipping tests on CentOS 7 because they do not work in CI" - exit 0 - fi -fi +set -e CRIU=./criu From e9901cdf2df32de3a05fbb26ec36067326f66135 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 20 Jun 2023 08:34:23 +0100 Subject: [PATCH 065/321] coredump: drop python 2 support This patch reverts changes introduced for Python 2 compatibility in commits: 1c866db (Add new files for running criu-coredump via python 2 or 3) 3180d35 (Add support for python3 in criu-coredump). Signed-off-by: Radostin Stoyanov --- Makefile | 2 +- coredump/{coredump.py => coredump} | 1 + coredump/coredump-python2 | 6 ------ coredump/coredump-python3 | 6 ------ coredump/criu_coredump/coredump.py | 16 ++-------------- test/others/env.sh | 2 +- 6 files changed, 5 insertions(+), 28 deletions(-) rename coredump/{coredump.py => coredump} (98%) mode change 100644 => 100755 delete mode 100755 coredump/coredump-python2 delete mode 100755 coredump/coredump-python3 diff --git a/Makefile b/Makefile index c462a9ff0f..f89903d4a9 100644 --- a/Makefile +++ b/Makefile @@ -441,7 +441,7 @@ lint: flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py flake8 --config=scripts/flake8.cfg crit/setup.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py - flake8 --config=scripts/flake8.cfg coredump/ + flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install diff --git a/coredump/coredump.py b/coredump/coredump old mode 100644 new mode 100755 
similarity index 98% rename from coredump/coredump.py rename to coredump/coredump index 88a1b374c6..f70d37c13b --- a/coredump/coredump.py +++ b/coredump/coredump @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import os import sys diff --git a/coredump/coredump-python2 b/coredump/coredump-python2 deleted file mode 100755 index 564c05ce9f..0000000000 --- a/coredump/coredump-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -import coredump - -if __name__ == '__main__': - coredump.main() diff --git a/coredump/coredump-python3 b/coredump/coredump-python3 deleted file mode 100755 index 3032dbadf1..0000000000 --- a/coredump/coredump-python3 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -import coredump - -if __name__ == '__main__': - coredump.main() diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 8ee4026768..0b8a02e0aa 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -35,12 +35,6 @@ from pycriu import images from . 
import elf - -try: - from itertools import ifilter as filter -except ImportError: - pass - # Some memory-related constants PAGESIZE = 4096 status = { @@ -318,10 +312,7 @@ def _gen_prpsinfo(self, pid): # prpsinfo.pr_psargs has a limit of 80 characters which means it will # fail here if the cmdline is longer than 80 prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] - if (sys.version_info > (3, 0)): - prpsinfo.pr_fname = core["tc"]["comm"].encode() - else: - prpsinfo.pr_fname = core["tc"]["comm"] + prpsinfo.pr_fname = core["tc"]["comm"].encode() nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -581,10 +572,7 @@ class elf_files(ctypes.Structure): setattr(data, "start" + str(i), info.start) setattr(data, "end" + str(i), info.end) setattr(data, "file_ofs" + str(i), info.file_ofs) - if (sys.version_info > (3, 0)): - setattr(data, "name" + str(i), info.name.encode()) - else: - setattr(data, "name" + str(i), info.name) + setattr(data, "name" + str(i), info.name.encode()) nhdr = elf.Elf64_Nhdr() diff --git a/test/others/env.sh b/test/others/env.sh index 45066f760b..a76207360f 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -13,5 +13,5 @@ fi #export PYTHON CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit-"${PYTHON}") crit=$CRIT -CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump-"${PYTHON}") +CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump) criu_coredump=$CRIU_COREDUMP From 91edb5f9ff345a6bd9efc96c3c1a2cff5d2f149e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 20 Jun 2023 08:43:34 +0100 Subject: [PATCH 066/321] crit: drop python 2 support This patch reverts changes introduced with the following commits: 4feb07020dedbf845fc00268d8ca02f4645641cd crit: enable python2 or python3 based crit b78c4e071a42ebe34aac82fa0711df07ed375e2b test: fix crit test and extend it Signed-off-by: Radostin Stoyanov --- .gitignore | 1 - Makefile | 16 ++++------------ crit/Makefile | 13 ------------- 
crit/{crit-python3 => crit} | 2 +- crit/crit-python2 | 6 ------ lib/Makefile | 2 +- lib/py/cli.py | 1 + test/others/env.sh | 11 +---------- 8 files changed, 8 insertions(+), 44 deletions(-) delete mode 100644 crit/Makefile rename crit/{crit-python3 => crit} (79%) delete mode 100755 crit/crit-python2 diff --git a/.gitignore b/.gitignore index 1ea828bbcd..2f2ab20290 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,6 @@ images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest -crit/crit criu/arch/*/sys-exec-tbl*.c # x86 syscalls-table is not generated !criu/arch/x86/sys-exec-tbl.c diff --git a/Makefile b/Makefile index f89903d4a9..a5c6c5bccf 100644 --- a/Makefile +++ b/Makefile @@ -156,7 +156,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: flog criu lib crit +all: flog criu lib .PHONY: all # @@ -268,26 +268,19 @@ criu: $(criu-deps) $(Q) $(MAKE) $(build)=criu all .PHONY: criu -crit/Makefile: ; -crit/%: criu .FORCE - $(Q) $(MAKE) $(build)=crit $@ -crit: criu - $(Q) $(MAKE) $(build)=crit all -.PHONY: crit - unittest: $(criu-deps) $(Q) $(MAKE) $(build)=criu unittest .PHONY: unittest # -# Libraries next once crit it ready +# Libraries next once criu is ready # (we might generate headers and such # when building criu itself). 
lib/Makefile: ; -lib/%: crit .FORCE +lib/%: criu .FORCE $(Q) $(MAKE) $(build)=lib $@ -lib: crit +lib: criu $(Q) $(MAKE) $(build)=lib all .PHONY: lib @@ -300,7 +293,6 @@ clean mrproper: $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ $(Q) $(MAKE) $(build)=lib $@ - $(Q) $(MAKE) $(build)=crit $@ .PHONY: clean mrproper clean-amdgpu_plugin: diff --git a/crit/Makefile b/crit/Makefile deleted file mode 100644 index 988b481b63..0000000000 --- a/crit/Makefile +++ /dev/null @@ -1,13 +0,0 @@ - -all-y += crit - -crit/crit: crit/crit-$(PYTHON) - $(Q) cp $^ $@ -crit: crit/crit -.PHONY: crit - -clean-crit: - $(Q) $(RM) crit/crit -.PHONY: clean-crit -clean: clean-crit -mrproper: clean diff --git a/crit/crit-python3 b/crit/crit similarity index 79% rename from crit/crit-python3 rename to crit/crit index 80467cba72..3b15ca6545 100755 --- a/crit/crit-python3 +++ b/crit/crit @@ -3,4 +3,4 @@ from pycriu import cli if __name__ == '__main__': - cli.main() + cli.main() diff --git a/crit/crit-python2 b/crit/crit-python2 deleted file mode 100755 index b0b7d3c3a0..0000000000 --- a/crit/crit-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -from pycriu import cli - -if __name__ == '__main__': - cli.main() diff --git a/lib/Makefile b/lib/Makefile index ff540fb75d..7ed73f9ab6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -41,7 +41,7 @@ clean: clean-lib cleanup-y += lib/c/$(CRIU_SO) lib/c/$(CRIU_A) lib/c/criu.pc mrproper: clean -install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in +install: lib-c lib-a lib-py lib/c/criu.pc.in $(E) " INSTALL " lib $(Q) mkdir -p $(DESTDIR)$(LIBDIR) $(Q) install -m 755 lib/c/$(CRIU_SO) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) diff --git a/lib/py/cli.py b/lib/py/cli.py index 5419384c3d..82079c7f49 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from __future__ import print_function import argparse import sys diff --git a/test/others/env.sh 
b/test/others/env.sh index a76207360f..6d830fb58e 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -2,16 +2,7 @@ CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) criu=$CRIU -if [ $(which python3) ]; then - PYTHON=python3 -elif [ $(which python2) ]; then - PYTHON=python2 -else - echo "FAIL: Neither python3 nor python2" - exit 1 -fi -#export PYTHON -CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit-"${PYTHON}") +CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit) crit=$CRIT CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump) criu_coredump=$CRIU_COREDUMP From 02bd1bc7f363a565c896aca1a0ce5a24ca107f3b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 18 Jun 2023 14:20:04 +0200 Subject: [PATCH 067/321] crit: add requirements.txt for pip>=20.1 When building with pip version 20.0.2 or older, the pip install command creates a temporary directory and copies all files from ./crit. This results in the following error message: ModuleNotFoundError: No module named 'pycriu' This error appears because the symlink 'pycriu' uses a relative path that becomes invalid '../lib/py/'. The '--no-build-isolation' option for pip install is needed to enable the use of pre-installed dependencies (e.g., protobuf) during build. The '--ignore-installed' option for pip is needed to avoid an error when crit is already installed. For example, crit is installed in the GitHub CI environment as part of the criu OBS package as a dependency for podman. Distributions such as Arch Linux have adopted an externally managed python installation in compliance with PEP 668 [1] that prevents pip from breaking the system by either installing packages to the system or locally in the home folder. The '--break-system-packages' [2] option allows pip to modify an externally managed Python installation. 
[1] https://peps.python.org/pep-0668/ [2] https://pip.pypa.io/en/stable/cli/pip_uninstall/ Signed-off-by: Radostin Stoyanov --- crit/pyproject.toml | 3 ++- crit/requirements.txt | 7 +++++++ lib/Makefile | 27 ++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 crit/requirements.txt diff --git a/crit/pyproject.toml b/crit/pyproject.toml index b1e1a4650a..019b0d8488 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -1,2 +1,3 @@ [build-system] -requires = ["setuptools"] +# Minimum requirements for the build system to execute. +requires = ["setuptools", "wheel"] # PEP 508 specifications. diff --git a/crit/requirements.txt b/crit/requirements.txt new file mode 100644 index 0000000000..c27e6d4f0b --- /dev/null +++ b/crit/requirements.txt @@ -0,0 +1,7 @@ +# We need pip version 20.1 or newer to correctly build with 'pycriu' symlink. +# - Building of local directories with pip 20.1 or newer is done in place, +# instead of a temporary location containing a copy of the directory tree. +# (https://github.com/pypa/pip/issues/7555) +pip>=20.1 +setuptools>=42.0.0 +wheel diff --git a/lib/Makefile b/lib/Makefile index 7ed73f9ab6..32d238de4d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -4,6 +4,9 @@ UAPI_HEADERS := lib/c/criu.h images/rpc.proto images/rpc.pb-c.h criu/include/ve all-y += lib-c lib-a lib-py +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES := 0 + # # C language bindings. 
lib/c/Makefile: ; @@ -54,9 +57,19 @@ install: lib-c lib-a lib-py lib/c/criu.pc.in $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig -ifeq ($(PYTHON),python3) +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" +else + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install -r ./crit/requirements.txt + $(Q) $(PYTHON) -m pip install --no-build-isolation --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit +endif +else $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install --upgrade --force-reinstall --prefix=$(DESTDIR)$(PREFIX) ./crit + $(Q) $(PYTHON) -m pip install -r ./crit/requirements.txt + $(Q) $(PYTHON) -m pip install --no-build-isolation --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit endif .PHONY: install @@ -69,7 +82,15 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) -ifeq ($(PYTHON),python3) +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" +else + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +endif +else $(E) " UNINSTALL" crit $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit endif From c794f396560bf55602220fde39e12c2a24914e4c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 21 Jun 2023 23:48:32 
+0100 Subject: [PATCH 068/321] remove python-future dependency This commit removes the dependency on the __future__ module, which was used to enable Python 3 features in Python 2 code. With support for Python 2 being dropped, it is no longer necessary to maintain backward compatibility. Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 4 ++-- .lgtm.yml | 1 - contrib/debian/dev-packages.lst | 1 - criu/Makefile.packages | 2 -- lib/py/cli.py | 1 - scripts/build/Dockerfile.amd-rocm | 1 - scripts/build/Dockerfile.centos8 | 1 - scripts/build/Dockerfile.hotspot-ubuntu | 2 -- scripts/build/Dockerfile.linux32.tmpl | 3 +-- scripts/build/Dockerfile.openj9-ubuntu | 1 - scripts/build/Dockerfile.tmpl | 3 +-- scripts/ci/prepare-for-fedora-rawhide.sh | 1 - scripts/ci/run-ci-tests.sh | 4 ++-- scripts/ci/vagrant.sh | 2 +- soccr/test/tcp-test.py | 3 +-- test/zdtm.py | 8 +------- 16 files changed, 9 insertions(+), 29 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 8b8212d695..e559ec772a 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-future python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. 
# The Cirrus CI user runs as a service from selinux point of view and is @@ -108,7 +108,7 @@ task: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : yum install -y dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-importlib-metadata python3-junit_xml xmlto + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed diff --git a/.lgtm.yml b/.lgtm.yml index a884a53ef1..0dd49cda41 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -22,7 +22,6 @@ extraction: - "libbsd-dev" - "python3-yaml" - "libnl-route-3-dev" - - "python-future" - "gnutls-dev" configure: command: diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst index c2d1509fa1..ce45f1b7cf 100644 --- a/contrib/debian/dev-packages.lst +++ b/contrib/debian/dev-packages.lst @@ -17,4 +17,3 @@ libcap-dev libaio-dev python3-yaml libnl-route-3-dev -python-future diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 13c346f449..f436737fda 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,7 +6,6 @@ REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel -REQ-RPM-PKG-NAMES += $(PYTHON)-future REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -15,7 +14,6 @@ REQ-DEB-PKG-NAMES += libprotobuf-c-dev 
REQ-DEB-PKG-NAMES += protobuf-c-compiler REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf -REQ-DEB-PKG-NAMES += $(PYTHON)-future REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev diff --git a/lib/py/cli.py b/lib/py/cli.py index 82079c7f49..221f7be0d8 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -from __future__ import print_function import argparse import sys import json diff --git a/scripts/build/Dockerfile.amd-rocm b/scripts/build/Dockerfile.amd-rocm index c0d181b039..c466a73d2d 100644 --- a/scripts/build/Dockerfile.amd-rocm +++ b/scripts/build/Dockerfile.amd-rocm @@ -55,7 +55,6 @@ RUN apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-insta protobuf-compiler \ python-protobuf \ python3-minimal \ - python3-future \ python-ipaddress \ curl \ wget \ diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index 488f95d650..b065246744 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -28,7 +28,6 @@ RUN yum install -y --allowerasing \ python3-devel \ python3-flake8 \ python3-PyYAML \ - python3-future \ python3-protobuf \ python3-pip \ sudo \ diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 350102818b..0318f650f3 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -6,7 +6,6 @@ COPY scripts/ci/apt-install /bin/apt-install RUN apt-install protobuf-c-compiler \ libprotobuf-c-dev \ libaio-dev \ - python3-future \ libprotobuf-dev \ protobuf-compiler \ libcap-dev \ @@ -31,4 +30,3 @@ WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index a15038631c..13e9926424 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ 
-21,8 +21,7 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ - python3-minimal \ - python3-future + python3-minimal COPY . /criu WORKDIR /criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 23db14e8df..c2cf20a36b 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -6,7 +6,6 @@ COPY scripts/ci/apt-install /bin/apt-install RUN apt-install protobuf-c-compiler \ libprotobuf-c-dev \ libaio-dev \ - python3-future \ libprotobuf-dev \ protobuf-compiler \ libcap-dev \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index e0e72372d9..9f6b1d0960 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -30,8 +30,7 @@ RUN apt-install \ python-is-python3 \ python3-minimal \ python3-protobuf \ - python3-yaml \ - python3-future + python3-yaml COPY . /criu WORKDIR /criu diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 7c62aaaa2c..1c8a46fbfd 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -24,7 +24,6 @@ dnf install -y \ protobuf-devel \ python3-flake8 \ python3-PyYAML \ - python3-future \ python3-protobuf \ python3-junit_xml \ python3-pip \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index b45183a847..6d837fe06b 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -5,8 +5,8 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time flake8 libbsd-dev python3-yaml - libperl-dev pkg-config python3-future python3-protobuf - python3-pip python3-importlib-metadata python3-junit.xml) + libperl-dev pkg-config python3-protobuf python3-pip + python3-importlib-metadata python3-junit.xml) 
X86_64_PKGS=(gcc-multilib) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 5cc8424423..ac4b5579d5 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -38,7 +38,7 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-flake8 python3-future python3-protobuf python3-importlib-metadata \ + protobuf-devel python3-flake8 python3-protobuf python3-importlib-metadata \ python3-junit_xml rubygem-asciidoctor iptables libselinux-devel libbpf-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd diff --git a/soccr/test/tcp-test.py b/soccr/test/tcp-test.py index ff3fe29dc2..b48f532eb0 100755 --- a/soccr/test/tcp-test.py +++ b/soccr/test/tcp-test.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 -from __future__ import print_function import sys, socket import hashlib diff --git a/test/zdtm.py b/test/zdtm.py index 1ef941b4e8..b56f06ef12 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1,10 +1,4 @@ -#!/usr/bin/env python -from __future__ import ( - absolute_import, - division, - print_function, - unicode_literals -) +#!/usr/bin/env python3 import argparse import atexit From 5f4d92f995212fbc7c04505a1dfc58e7559dc2db Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 00:06:53 +0100 Subject: [PATCH 069/321] make: remove checks for python 2 binary This commit removes the checks for the Python 2 binary in the makefile and makes sure that ZDTM tests always use python3. Since support for Python 2 has been dropped, these checks are no longer needed. 
Signed-off-by: Radostin Stoyanov --- Documentation/Makefile | 4 +--- criu/Makefile | 2 -- criu/Makefile.packages | 5 +---- scripts/build/Dockerfile.tmpl | 1 - scripts/ci/run-ci-tests.sh | 4 ---- scripts/criu-ns | 2 +- scripts/magic-gen.py | 2 +- scripts/nmk/scripts/tools.mk | 2 +- soccr/test/Makefile | 3 +-- soccr/test/run.py | 2 +- test/check_actions.py | 2 +- test/crit-recode.py | 2 +- test/exhaustive/pipe.py | 2 +- test/exhaustive/unix.py | 2 +- test/inhfd/memfd.py.checkskip | 2 +- test/others/criu-ns/run.py | 2 +- test/others/ext-tty/run.py | 2 +- test/others/mnt-ext-dev/run.sh | 4 ++-- test/others/mounts/mounts.sh | 2 +- test/others/rpc/Makefile | 2 +- test/others/rpc/config_file.py | 2 +- test/others/rpc/errno.py | 2 +- test/others/rpc/ps_test.py | 2 +- test/others/rpc/restore-loop.py | 2 +- test/others/rpc/test.py | 2 +- test/others/rpc/version.py | 2 +- test/others/shell-job/run.py | 2 +- test/zdtm/static/cgroup_yard.hook | 2 +- test/zdtm/static/file_locks06.checkskip | 2 +- test/zdtm/static/net_lock_socket_iptables.hook | 2 +- test/zdtm/static/netns_lock_iptables.hook | 4 ++-- test/zdtm/static/socket-tcp-fin-wait1.hook | 2 +- 32 files changed, 31 insertions(+), 44 deletions(-) diff --git a/Documentation/Makefile b/Documentation/Makefile index 72bf0e8623..de0cc448dc 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -12,11 +12,9 @@ endif FOOTER := footer.txt SRC1 += crit.txt -ifeq ($(PYTHON),python3) SRC1 += criu-ns.txt -endif SRC1 += compel.txt -SRC1 += criu-amdgpu-plugin.txt +SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) diff --git a/criu/Makefile b/criu/Makefile index c6050d5826..bafdd980bb 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -145,10 +145,8 @@ install: $(obj)/criu $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(Q) install -m 755 scripts/systemd-autofs-restart.sh 
$(DESTDIR)$(LIBEXECDIR)/criu/scripts -ifeq ($(PYTHON),python3) $(E) " INSTALL " scripts/criu-ns $(Q) install -m 755 scripts/criu-ns $(DESTDIR)$(SBINDIR) -endif .PHONY: install uninstall: diff --git a/criu/Makefile.packages b/criu/Makefile.packages index f436737fda..7f6113c8f1 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -20,13 +20,10 @@ REQ-DEB-PKG-NAMES += libcap-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev -ifeq ($(PYTHON),python3) REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML -else -REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml -endif + export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 9f6b1d0960..9b53a76aab 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -27,7 +27,6 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ - python-is-python3 \ python3-minimal \ python3-protobuf \ python3-yaml diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 6d837fe06b..79744c7507 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -58,10 +58,6 @@ ci_prep () { scripts/ci/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" - - # zdtm uses an unversioned python binary to run the tests. 
- # let's point python to python3 - ln -sf /usr/bin/python3 /usr/bin/python } test_stream() { diff --git a/scripts/criu-ns b/scripts/criu-ns index 3c77b8eb49..4c032aa140 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import ctypes import ctypes.util import errno diff --git a/scripts/magic-gen.py b/scripts/magic-gen.py index 3b1f29fb52..38dff1424a 100755 --- a/scripts/magic-gen.py +++ b/scripts/magic-gen.py @@ -1,4 +1,4 @@ -#!/bin/env python2 +#!/bin/env python3 import sys diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index 1681d4e909..724204a03c 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell which python3 2>/dev/null || which python2 2>/dev/null) +FULL_PYTHON := $(shell which python3 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ diff --git a/soccr/test/Makefile b/soccr/test/Makefile index 4585400454..499901b0c5 100644 --- a/soccr/test/Makefile +++ b/soccr/test/Makefile @@ -21,7 +21,6 @@ tcp-conn-v6: tcp-conn-v6.c test: tcp-constructor tcp-conn tcp-conn-v6 unshare -n sh -c "ip link set up dev lo; ./tcp-conn" unshare -n sh -c "ip link set up dev lo; ./tcp-conn-v6" - python run.py ./$(RUN) + python3 run.py ./$(RUN) .PHONY: test - diff --git a/soccr/test/run.py b/soccr/test/run.py index 1ffe58a582..57c556e361 100644 --- a/soccr/test/run.py +++ b/soccr/test/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys, os import hashlib diff --git a/test/check_actions.py b/test/check_actions.py index 4973e39382..84d738dbb7 100755 --- a/test/check_actions.py +++ b/test/check_actions.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/crit-recode.py b/test/crit-recode.py index 4135681e11..f119271d8b 100755 
--- a/test/crit-recode.py +++ b/test/crit-recode.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import pycriu import sys import os diff --git a/test/exhaustive/pipe.py b/test/exhaustive/pipe.py index 7f1c53d34b..afe20846a3 100755 --- a/test/exhaustive/pipe.py +++ b/test/exhaustive/pipe.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import argparse import os diff --git a/test/exhaustive/unix.py b/test/exhaustive/unix.py index 6f72dd44b7..689b1fb3ae 100755 --- a/test/exhaustive/unix.py +++ b/test/exhaustive/unix.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 252778969d..27e2b7b155 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import ctypes libc = ctypes.CDLL(None) diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py index 6967b46b29..9d068476fb 100755 --- a/test/others/criu-ns/run.py +++ b/test/others/criu-ns/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import fcntl import os diff --git a/test/others/ext-tty/run.py b/test/others/ext-tty/run.py index 8109033cb9..2c268a2c8f 100755 --- a/test/others/ext-tty/run.py +++ b/test/others/ext-tty/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import subprocess import os, sys, time, signal, pty diff --git a/test/others/mnt-ext-dev/run.sh b/test/others/mnt-ext-dev/run.sh index 5a1f44450a..3f6163e084 100755 --- a/test/others/mnt-ext-dev/run.sh +++ b/test/others/mnt-ext-dev/run.sh @@ -2,7 +2,7 @@ set -e -x # construct root -python ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns +python3 ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns truncate -s 0 zdtm.loop truncate -s 50M zdtm.loop @@ -11,7 +11,7 @@ dev=`losetup --find --show zdtm.loop` mkdir -p ../../dev cp -ap $dev ../../dev export ZDTM_MNT_EXT_DEV=$dev 
-python ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? +python3 ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? losetup -d $dev unlink zdtm.loop exit $ret diff --git a/test/others/mounts/mounts.sh b/test/others/mounts/mounts.sh index 51ea69540d..bed156a50c 100755 --- a/test/others/mounts/mounts.sh +++ b/test/others/mounts/mounts.sh @@ -20,7 +20,7 @@ for i in `awk '{ print $2 }' < /proc/self/mounts`; do umount -l $i done -python mounts.py +python3 mounts.py kill $INMNTNS_PID while :; do sleep 10 diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index fc64f0c977..69537bb0d3 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -4,7 +4,7 @@ all: test-c rpc_pb2.py criu CFLAGS += -g -Werror -Wall -I. LDLIBS += -lprotobuf-c -PYTHON ?= python +PYTHON ?= python3 run: all @make -C .. loop diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 90c80fcaea..6cffe270d0 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import argparse import os diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index f84757efd6..b600b6d1c4 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Test criu errno import socket, os, errno diff --git a/test/others/rpc/ps_test.py b/test/others/rpc/ps_test.py index b51357d426..daeda49bce 100755 --- a/test/others/rpc/ps_test.py +++ b/test/others/rpc/ps_test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys, errno import rpc_pb2 as rpc diff --git a/test/others/rpc/restore-loop.py b/test/others/rpc/restore-loop.py index 84a2ce56d1..67110c2cf5 100755 --- a/test/others/rpc/restore-loop.py +++ b/test/others/rpc/restore-loop.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/test.py 
b/test/others/rpc/test.py index 80f6338f45..ce8411bc60 100755 --- a/test/others/rpc/test.py +++ b/test/others/rpc/test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/version.py b/test/others/rpc/version.py index 9d7fa745b5..a18cd5b7b7 100755 --- a/test/others/rpc/version.py +++ b/test/others/rpc/version.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import rpc_pb2 as rpc diff --git a/test/others/shell-job/run.py b/test/others/shell-job/run.py index a59945d6a7..969965f00f 100755 --- a/test/others/shell-job/run.py +++ b/test/others/shell-job/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os, pty, sys, subprocess import termios, fcntl, time diff --git a/test/zdtm/static/cgroup_yard.hook b/test/zdtm/static/cgroup_yard.hook index d06bc45fde..b70bd59e94 100755 --- a/test/zdtm/static/cgroup_yard.hook +++ b/test/zdtm/static/cgroup_yard.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/zdtm/static/file_locks06.checkskip b/test/zdtm/static/file_locks06.checkskip index 06ab585216..c5039a2d23 100755 --- a/test/zdtm/static/file_locks06.checkskip +++ b/test/zdtm/static/file_locks06.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import fcntl import tempfile import struct diff --git a/test/zdtm/static/net_lock_socket_iptables.hook b/test/zdtm/static/net_lock_socket_iptables.hook index 0ee147eb2b..e9fcd73509 100755 --- a/test/zdtm/static/net_lock_socket_iptables.hook +++ b/test/zdtm/static/net_lock_socket_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import socket import time diff --git a/test/zdtm/static/netns_lock_iptables.hook b/test/zdtm/static/netns_lock_iptables.hook index e7daf8a655..b51d3c2cc2 100755 --- a/test/zdtm/static/netns_lock_iptables.hook +++ b/test/zdtm/static/netns_lock_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env 
python +#!/usr/bin/env python3 import subprocess import socket @@ -67,7 +67,7 @@ if sys.argv[1] == "--post-start": cln, addr = srv.accept() cln.sendall(str.encode("--post-restore")) cln.close() - + # Server will be closed when zdtm sends SIGKILL if sys.argv[1] == "--pre-dump": diff --git a/test/zdtm/static/socket-tcp-fin-wait1.hook b/test/zdtm/static/socket-tcp-fin-wait1.hook index 9504557dad..9dcd089991 100755 --- a/test/zdtm/static/socket-tcp-fin-wait1.hook +++ b/test/zdtm/static/socket-tcp-fin-wait1.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys sys.path.append("../crit") From cc5742d7073e5a991bb1eaa5fdaebbd5718b2591 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 02:54:30 +0100 Subject: [PATCH 070/321] test/criu-ns: drop python 2 compatibility This patch is replacing the set_blocking() function with os.set_blocking(). This function was introduced for compatibility with Python 2 in commit 8094df8di (criu-ns: Add tests for criu-ns script). 
Signed-off-by: Radostin Stoyanov --- test/others/criu-ns/run.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py index 9d068476fb..0a36438e80 100755 --- a/test/others/criu-ns/run.py +++ b/test/others/criu-ns/run.py @@ -25,19 +25,6 @@ def check_dumpdir(path=IMG_DIR): os.mkdir(path, 0o755) -def set_blocking(fd, blocking): - """Implement os.set_blocking() for compatibility with Python - versions earlier than 3.5""" - flags = fcntl.fcntl(fd, fcntl.F_GETFL) - - if blocking: - flags &= ~os.O_NONBLOCK - else: - flags |= os.O_NONBLOCK - - fcntl.fcntl(fd, fcntl.F_SETFL, flags) - - def run_task_with_own_pty(task): fd_m, fd_s = pty.openpty() @@ -55,7 +42,7 @@ def run_task_with_own_pty(task): os.close(fd_s) fd_m = os.fdopen(fd_m, "rb") - set_blocking(fd_m.fileno(), False) + os.set_blocking(fd_m.fileno(), False) while True: try: From a8cfec9dc43537e834a2d439927dfd8750476037 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 03:11:20 +0100 Subject: [PATCH 071/321] test/others: drop setup_swrk() py2 compatibility This patch removes the code introduced for compatibility with Python 2 in commits: 4c1ee3e227045fc1dc07b10ac7a538a68299693b test/other: Resolve Py3 compatibility issues 6b615ca15277fc14b52a09b4eb18314b7c6cbe75 test/others: Reuse setup_swrk() Signed-off-by: Radostin Stoyanov --- test/others/rpc/setup_swrk.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/test/others/rpc/setup_swrk.py b/test/others/rpc/setup_swrk.py index c7f84f952a..ffaa01de42 100644 --- a/test/others/rpc/setup_swrk.py +++ b/test/others/rpc/setup_swrk.py @@ -5,12 +5,6 @@ def setup_swrk(): print('Connecting to CRIU in swrk mode.') s1, s2 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) - - kwargs = {} - if sys.version_info.major == 3: - kwargs["pass_fds"] = [s1.fileno()] - - swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], **kwargs) + swrk = 
subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], pass_fds=[s1.fileno()]) s1.close() return swrk, s2 - From 460c4d26972ecf57d5fe9c5bbbc0ed2ded9b8bf2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 03:53:03 +0100 Subject: [PATCH 072/321] lib/py: drop python 2 compatibility This patch removes code introduced for compatibility with Python 2 in commits: bf80fee (lib: correctly handle stdin/stdout (Python 3)) b82f222 (lib: fix crit-recode fix for Python 2) Signed-off-by: Radostin Stoyanov --- lib/py/cli.py | 4 ---- lib/py/images/images.py | 11 ++--------- lib/py/images/pb2dict.py | 11 ++--------- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/lib/py/cli.py b/lib/py/cli.py index 221f7be0d8..594035d27c 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -11,8 +11,6 @@ def inf(opts): if opts['in']: return open(opts['in'], 'rb') else: - if (sys.version_info < (3, 0)): - return sys.stdin if sys.stdin.isatty(): # If we are reading from a terminal (not a pipe) we want text input and not binary return sys.stdin @@ -28,8 +26,6 @@ def outf(opts, decode): mode = 'w+' return open(opts['out'], mode) else: - if (sys.version_info < (3, 0)): - return sys.stdout if decode: return sys.stdout return sys.stdout.buffer diff --git a/lib/py/images/images.py b/lib/py/images/images.py index a1d76e7cf2..9db506e1ee 100644 --- a/lib/py/images/images.py +++ b/lib/py/images/images.py @@ -42,7 +42,6 @@ import struct import os import array -import sys from . import magic from . 
import pb @@ -71,18 +70,12 @@ def __init__(self, magic): def decode_base64_data(data): """A helper function to decode base64 data.""" - if (sys.version_info > (3, 0)): - return base64.decodebytes(str.encode(data)) - else: - return base64.decodebytes(data) + return base64.decodebytes(str.encode(data)) def write_base64_data(f, data): """A helper function to write base64 encoded data to a file.""" - if (sys.version_info > (3, 0)): - f.write(base64.decodebytes(str.encode(data))) - else: - f.write(base64.decodebytes(data)) + f.write(base64.decodebytes(str.encode(data))) # Generic class to handle loading/dumping criu images entries from/to bin diff --git a/lib/py/images/pb2dict.py b/lib/py/images/pb2dict.py index 9d581c3750..c7046429e0 100644 --- a/lib/py/images/pb2dict.py +++ b/lib/py/images/pb2dict.py @@ -3,7 +3,6 @@ import os import quopri import socket -import sys from ipaddress import IPv4Address, IPv6Address, ip_address from google.protobuf.descriptor import FieldDescriptor as FD @@ -247,17 +246,11 @@ def encode_dev(field, value): def encode_base64(value): - if (sys.version_info > (3, 0)): - return base64.encodebytes(value).decode() - else: - return base64.encodebytes(value) + return base64.encodebytes(value).decode() def decode_base64(value): - if (sys.version_info > (3, 0)): - return base64.decodebytes(str.encode(value)) - else: - return base64.decodebytes(value) + return base64.decodebytes(str.encode(value)) def encode_unix(value): From 75d9d6822a59ea3f839d7cb11ef879ee5d49fc52 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 04:00:39 +0100 Subject: [PATCH 073/321] zdtm: drop python 2 compatibility This patch removes the code for Python 2 compatibility introduced with commit e65c7b5 (zdtm: Replace imp module with importlib). 
Signed-off-by: Radostin Stoyanov --- test/zdtm.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index b56f06ef12..c6e852dc1a 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -635,14 +635,10 @@ def cleanup(): def load_module_from_file(name, path): - if sys.version_info[0] == 3 and sys.version_info[1] >= 5: - import importlib.util - spec = importlib.util.spec_from_file_location(name, path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - else: - import imp - mod = imp.load_source(name, path) + import importlib.util + spec = importlib.util.spec_from_file_location(name, path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) return mod From b759678e2d8a38e07b668882b1aa444e0b08b4d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 14:37:40 +0200 Subject: [PATCH 074/321] cgroup: Propagate error on cgroup mount failure. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the error to mount cgroup hierarchy a bit less noisy: Error (criu/cgroup.c:623): cg: Unable to mount cgroup2 : Invalid argument' Instead of Error (criu/cgroup.c:623): cg: Unable to mount cgroup2 : Invalid argument' Error (criu/cgroup.c:715): cg: failed walking /proc/self/fd/-1/zdtmtst for empty cgroups: No such file or directory' Signed-off-by: Michał Mirosław --- criu/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/cgroup.c b/criu/cgroup.c index 0bf7b3818c..267a5b6b47 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -714,6 +714,8 @@ static int collect_cgroups(struct list_head *ctls) } } else { fd = open_cgroupfs(cc); + if (fd < 0) + return -1; } path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); From ce33c4926f1916ba9ad71cdd93172aca20e1ad3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 9 Dec 2022 16:02:03 +0100 Subject: [PATCH
075/321] files-reg: Debug "open file on overmounted mount" error. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Log the mount and file that were the cause of failing a dump. Signed-off-by: Michał Mirosław --- criu/files-reg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index ed8b9c8899..5120977161 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1818,7 +1818,8 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) } if (!skip_for_shell_job && mnt_is_overmounted(mi)) { - pr_err("Open files on overmounted mounts are not supported yet\n"); + pr_err("Open files on overmounted mounts are not supported yet; mount=%d fd=%d path=%s\n", + p->mnt_id, p->fd, link->name + 1); return -1; } From b42e7af88ceb8947f37e990ea655b7b8a5a942e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:31:33 +0200 Subject: [PATCH 076/321] compel: Log the status word with "Task is still running" errors.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- compel/src/lib/infect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 5aab7aa3ee..022d4ebf33 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -589,7 +589,7 @@ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, user_regs_struct_t } if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); goto err; } @@ -1398,7 +1398,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pr_debug("Daemon %d exited trapping\n", pid); if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); return -1; } From cf01c325558f83a913911f53d374fb2b66db1d96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:28:22 +0200 Subject: [PATCH 077/321] sk-unix: Log both peer names when failing on an external stream unix socket. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make debugging dump failures resulting in "sk unix: Can't dump half of stream unix connection" errors easier. Signed-off-by: Michał Mirosław --- criu/sk-unix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 841152643a..fd38ee7b1c 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -878,7 +878,8 @@ static int __dump_external_socket(struct unix_sk_desc *sk, struct unix_sk_desc * if (peer->type != SOCK_DGRAM) { show_one_unix("Ext stream not supported", peer); - pr_err("Can't dump half of stream unix connection.\n"); + pr_err("Can't dump half of stream unix connection.
name: %s; peer name: %s\n", + sk->name, peer->name); return -1; } From 13c08b83d611d3e068ca1438fbbeb274b44740df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 17 May 2023 20:04:20 +0200 Subject: [PATCH 078/321] soccr: Log offset when failed to restore socket's queued data. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- soccr/soccr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/soccr/soccr.c b/soccr/soccr.c index abea937033..6967835c79 100644 --- a/soccr/soccr.c +++ b/soccr/soccr.c @@ -816,7 +816,7 @@ static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) continue; } - logerr("Can't restore %d queue data (%d), want (%d:%d:%d)", queue, ret, chunk, len, max_chunk); + logerr("Can't restore %d queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); goto err; } off += ret; From dc3f4b5bb2165e094434ba5b2a007984b65fb90b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 18 May 2023 00:59:04 +0200 Subject: [PATCH 079/321] soccr: Log name of socket queue that failed to restore.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- soccr/soccr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/soccr/soccr.c b/soccr/soccr.c index 6967835c79..8e1ce1c633 100644 --- a/soccr/soccr.c +++ b/soccr/soccr.c @@ -781,7 +781,7 @@ int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsi return 0; } -static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) +static int __send_queue(struct libsoccr_sk *sk, const char *queue, char *buf, __u32 len) { int ret, err = -1, max_chunk; int off; @@ -816,7 +816,7 @@ static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) continue; } - logerr("Can't restore %d queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); + logerr("Can't restore %s queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); goto err; } off += ret; @@ -837,7 +837,7 @@ static int send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) return -1; } - return __send_queue(sk, queue, buf, len); + return __send_queue(sk, queue == TCP_RECV_QUEUE ? "recv" : "send", buf, len); } static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue, @@ -876,7 +876,7 @@ static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_dat * they can be restored without any tricks. */ tcp_repair_off(sk->fd); - if (__send_queue(sk, TCP_SEND_QUEUE, buf + len, ulen)) + if (__send_queue(sk, "not-sent send", buf + len, ulen)) return -3; if (tcp_repair_on(sk->fd)) return -4; From 4d67f67818b39e622d4986b0a45faa9631bba852 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 22 May 2023 14:49:47 +0200 Subject: [PATCH 080/321] log: Remove error logs for ignored or otherwise logged subprocess exits.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Errors in early restore.log for status=1 from a subprocess are confusing, esp. that they don't show what command failed. Since the result is either ignored or logged anyway, mark the calls as "can fail". Signed-off-by: Michał Mirosław --- criu/netfilter.c | 4 ++-- criu/util.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/criu/netfilter.c b/criu/netfilter.c index 2212fd9f23..9e78dc4b03 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -48,8 +48,8 @@ void preload_netfilter_modules(void) fd = -1; pr_perror("failed to open /dev/null, using log fd for net module preload"); } - cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, 0); - cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, 0); + cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, CRS_CAN_FAIL); + cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, CRS_CAN_FAIL); close_safe(&fd); } diff --git a/criu/util.c b/criu/util.c index 744ec60325..aa73083bd7 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1566,7 +1566,7 @@ static int is_iptables_nft(char *bin) goto err; } - ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, 0); + ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, CRS_CAN_FAIL); if (ret) { pr_err("%s -V failed\n", cmd[0]); goto err; From fb149f76b139fd60ef00222f8eb0685a066625ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 22 May 2023 18:52:59 +0200 Subject: [PATCH 081/321] mount: Demote fsnotify logs for ignored failures. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make logs about inaccessible mounts warnings, as the failures are normally harmless (e.g. failure to read /dev/cgroup) and don't make the CRIU run fail.
(If it happens that the fsnotify can't find a file, then to debug, full CRIU logs will be necessary anyway.) Signed-off-by: Michał Mirosław --- criu/mount.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/criu/mount.c b/criu/mount.c index c26aaa58da..afbd242810 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -1197,8 +1197,8 @@ int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinf dev == pm->s_dev_rt) return 0; - pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, - pm->fstype->name, pm->ns_mountpoint); + pr_warn("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, + pm->fstype->name, pm->ns_mountpoint); return -1; } @@ -1239,12 +1239,16 @@ int __open_mountpoint(struct mount_info *pm) int open_mount(unsigned int s_dev) { struct mount_info *m; + int mnt_fd; m = lookup_mnt_sdev(s_dev); if (!m) return -ENOENT; - return __open_mountpoint(m); + mnt_fd = __open_mountpoint(m); + if (mnt_fd < 0) + pr_err("Can't open mount %#x\n", s_dev); + return mnt_fd; } /* Bind-mount a mount point in a temporary place without children */ From cf4b225ac03cc2a08c80194e64ad008db0a9f0b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:26:56 +0200 Subject: [PATCH 082/321] irmap: Reduce error log severity to warning. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These errors originate from the filesystem scanning in irmap.c and are mostly benign. Nevertheless, if they do result in a failed irmap lookup, that failed lookup is more interesting from an application perspective.
Signed-off-by: Michał Mirosław --- criu/irmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/irmap.c b/criu/irmap.c index 7b9d77bc1f..2cdc660714 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -101,7 +101,7 @@ static int irmap_update_stat(struct irmap *i) pr_debug("Refresh stat for %s\n", i->path); if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { - pr_perror("Can't stat %s", i->path); + pr_pwarn("Can't stat %s", i->path); return -1; } @@ -136,7 +136,7 @@ static int irmap_update_dir(struct irmap *t) pr_debug("Refilling %s dir\n", t->path); fd = openat(mntns_root, t->path + 1, O_RDONLY); if (fd < 0) { - pr_perror("Can't open %s", t->path); + pr_pwarn("Can't open %s", t->path); return -1; } From a4bb3f9a063cea2aede654317c53b35b608d042e Mon Sep 17 00:00:00 2001 From: Yan Evzman Date: Fri, 7 Jul 2023 00:36:41 +0300 Subject: [PATCH 083/321] kerndat: bind ipv6-socket only if ipv6 is enabled Fixes: #2222 Fixes: f1c8d38 ("kerndat: check if setsockopt IPV6_FREEBIND is supported") Signed-off-by: Yan Evzman Signed-off-by: Andrei Vagin --- criu/kerndat.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/criu/kerndat.c b/criu/kerndat.c index 597fe5d925..c742016174 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1578,6 +1578,11 @@ static int kerndat_has_ipv6_freebind(void) { int sk, val; + if (!kdat.ipv6) { + kdat.has_ipv6_freebind = false; + return 0; + } + sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (sk == -1) { pr_perror("Unable to create a ipv6 dgram socket"); From a96aa58096e7f7b56e5b3eed89f8324d94eaea56 Mon Sep 17 00:00:00 2001 From: znley Date: Fri, 7 Jul 2023 15:29:59 +0800 Subject: [PATCH 084/321] zdtm: replace NR_fstat with NR_statx NR_fstat is a deprecated syscall, some modern architectures such as riscv and loongarch64 no longer support this syscall. It is usually replaced by NR_statx. NR_statx is supported since linux 4.10.
Signed-off-by: znley --- test/zdtm/static/seccomp_filter_inheritance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/seccomp_filter_inheritance.c b/test/zdtm/static/seccomp_filter_inheritance.c index 7a86cd85ee..5afcb3f845 100644 --- a/test/zdtm/static/seccomp_filter_inheritance.c +++ b/test/zdtm/static/seccomp_filter_inheritance.c @@ -100,7 +100,7 @@ int main(int argc, char **argv) if (filter_syscall(__NR_ptrace) < 0) _exit(1); - if (filter_syscall(__NR_fstat) < 0) + if (filter_syscall(__NR_statx) < 0) _exit(1); zdtm_seccomp = 1; From 935e60d65b9435aa431fa587799e5b4e4f380ade Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 7 Jul 2023 16:33:12 -0700 Subject: [PATCH 085/321] kerndat: don't leak a socket file descriptor kerndat_has_ipv6_freebind creates a socket but doesn't close it. Signed-off-by: Andrei Vagin --- criu/kerndat.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index c742016174..4b836b5f76 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1574,9 +1574,26 @@ static int kerndat_has_nftables_concat(void) #define IPV6_FREEBIND 78 #endif +static int __kerndat_has_ipv6_freebind(int sk) +{ + int val = 1; + + if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { + if (errno == ENOPROTOOPT) { + kdat.has_ipv6_freebind = false; + return 0; + } + pr_perror("Unable to setsockopt ipv6_freebind"); + return -1; + } + + kdat.has_ipv6_freebind = true; + return 0; +} + static int kerndat_has_ipv6_freebind(void) { - int sk, val; + int sk, ret; if (!kdat.ipv6) { kdat.has_ipv6_freebind = false; @@ -1589,18 +1606,9 @@ static int kerndat_has_ipv6_freebind(void) return -1; } - val = 1; - if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { - if (errno == ENOPROTOOPT) { - kdat.has_ipv6_freebind = false; - return 0; - } - pr_perror("Unable to setsockopt ipv6_freebind"); - return -1; - } - - 
kdat.has_ipv6_freebind = true; - return 0; + ret = __kerndat_has_ipv6_freebind(sk); + close(sk); + return ret; } /* From 2d6f04cbd8313c075ff6423d2ae69f7e518c6fe4 Mon Sep 17 00:00:00 2001 From: Prajwal S N Date: Sat, 1 Jul 2023 13:15:36 +0530 Subject: [PATCH 086/321] ci: add workflow to ensure self-contained commits Signed-off-by: Prajwal S N --- .github/workflows/check-commits.yml | 30 +++++++++++++++++++++++++++++ scripts/ci/Makefile | 8 ++++++++ 2 files changed, 38 insertions(+) create mode 100644 .github/workflows/check-commits.yml diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml new file mode 100644 index 0000000000..be2fbd2856 --- /dev/null +++ b/.github/workflows/check-commits.yml @@ -0,0 +1,30 @@ +name: Verify self-contained commits + +on: pull_request + +# Cancel any preceding run on the pull request +concurrency: + group: commit-test-${{ github.event.pull_request.number }} + +jobs: + build: + runs-on: ubuntu-latest + # Check if pull request does not have label "not-selfcontained-ok" + if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" + steps: + - uses: actions/checkout@v3 + with: + # Needed to rebase against the base branch + fetch-depth: 0 + # Checkout pull request HEAD commit instead of merge commit + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: sudo apt-get install -y libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + - name: Configure git user details + run: | + git config --global user.email "checkpoint-restore@users.noreply.github.com" + git config --global user.name "checkpoint-restore" + - name: Configure base branch without switching current branch + run: git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + - name: Build each commit + run: git rebase ${{ github.base_ref }} -x "make -C scripts/ci check-commit" diff --git a/scripts/ci/Makefile 
b/scripts/ci/Makefile index 30dd9ebeb8..5c45791034 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -102,5 +102,13 @@ vagrant-fedora-non-root: setup-vagrant .PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide vagrant-fedora-non-root +check-commit: + ($(MAKE) -j $$(nproc) -C ../.. && \ + echo "Commit $$(git rev-parse --short HEAD) built successfully") || \ + (echo "Build failed for $$(git rev-list -n 1 --pretty HEAD)" && \ + exit 1) + +.PHONY: check-commit + %: $(MAKE) -C ../build $@$(target-suffix) From f684719484b7ec70906d0af44144c64d0d1c19ea Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 11:23:38 +0800 Subject: [PATCH 087/321] include: add common header files for loongarch64 Signed-off-by: znley --- include/common/arch/loongarch64/asm/atomic.h | 62 +++++++++++++++++++ include/common/arch/loongarch64/asm/bitops.h | 24 +++++++ .../common/arch/loongarch64/asm/bitsperlong.h | 6 ++ include/common/arch/loongarch64/asm/linkage.h | 19 ++++++ include/common/arch/loongarch64/asm/page.h | 39 ++++++++++++ 5 files changed, 150 insertions(+) create mode 100644 include/common/arch/loongarch64/asm/atomic.h create mode 100644 include/common/arch/loongarch64/asm/bitops.h create mode 100644 include/common/arch/loongarch64/asm/bitsperlong.h create mode 100644 include/common/arch/loongarch64/asm/linkage.h create mode 100644 include/common/arch/loongarch64/asm/page.h diff --git a/include/common/arch/loongarch64/asm/atomic.h b/include/common/arch/loongarch64/asm/atomic.h new file mode 100644 index 0000000000..9017254397 --- /dev/null +++ b/include/common/arch/loongarch64/asm/atomic.h @@ -0,0 +1,62 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +#include +#include "common/compiler.h" + +typedef struct { + int counter; +} atomic_t; + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +static inline int 
__atomic_add(int i, atomic_t *v) +{ + int result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(v->counter), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)i, v) +#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) +#define atomic_inc(v) atomic_add(1, v) +#define atomic_inc_return(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(ptr->counter) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/loongarch64/asm/bitops.h b/include/common/arch/loongarch64/asm/bitops.h new file mode 100644 index 0000000000..170e4f7369 --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitops.h @@ -0,0 +1,24 @@ +#ifndef _LINUX_BITOPS_H +#define _LINUX_BITOPS_H +#include "common/asm-generic/bitops.h" + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. 
+ */ + +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((1UL << ((nr) / BITS_PER_LONG)) - 1) +static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long res, mask; + mask = BIT_MASK(nr); + asm volatile("amor_db.d %0, %2, %1" : "=&r"(res), "+ZB"(addr[BIT_WORD(nr)]) : "r"(mask) : "memory"); + return (res & mask) != 0; +} + +#endif diff --git a/include/common/arch/loongarch64/asm/bitsperlong.h b/include/common/arch/loongarch64/asm/bitsperlong.h new file mode 100644 index 0000000000..13d06a384e --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG _LOONGARCH_SZLONG + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/loongarch64/asm/linkage.h b/include/common/arch/loongarch64/asm/linkage.h new file mode 100644 index 0000000000..448acc29fc --- /dev/null +++ b/include/common/arch/loongarch64/asm/linkage.h @@ -0,0 +1,19 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#define __ALIGN .align 2 +#define __ALIGN_STR ".align 2" + +#define GLOBAL(name) \ + .globl name; \ +name: + +#define ENTRY(name) \ + .globl name; \ + __ALIGN; \ + .type name, @function; \ +name: + +#define END(sym) .size sym, .- sym + +#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/loongarch64/asm/page.h b/include/common/arch/loongarch64/asm/page.h new file mode 100644 index 0000000000..25bdbc1412 --- /dev/null +++ b/include/common/arch/loongarch64/asm/page.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +static unsigned __page_size; +static unsigned __page_shift; + +static inline unsigned page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if 
(!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SIZE page_size() +#define PAGE_SHIFT page_shift() +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) +#else /* CR_NOGLIBC */ + +extern unsigned page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_PAGE_H__ */ From 52630dbf58656a1e01a47c7191da5ce7cc927ac0 Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 09:35:40 +0000 Subject: [PATCH 088/321] compel: add loongarch64 support Signed-off-by: znley --- Makefile | 10 +- compel/Makefile | 2 +- .../plugins/include/asm/prologue.h | 35 +++ .../plugins/include/asm/syscall-types.h | 30 +++ .../loongarch64/plugins/include/features.h | 4 + .../loongarch64/plugins/std/parasite-head.S | 9 + .../plugins/std/syscalls/Makefile.syscalls | 117 ++++++++++ .../syscalls/syscall-common-loongarch-64.S | 44 ++++ .../plugins/std/syscalls/syscall_64.tbl | 121 +++++++++++ .../loongarch64/scripts/compel-pack.lds.S | 32 +++ compel/arch/loongarch64/src/lib/cpu.c | 41 ++++ .../loongarch64/src/lib/handle-elf-host.c | 22 ++ compel/arch/loongarch64/src/lib/handle-elf.c | 22 ++ .../loongarch64/src/lib/include/handle-elf.h | 8 + .../loongarch64/src/lib/include/syscall.h | 8 + .../src/lib/include/uapi/asm/breakpoints.h | 6 + .../src/lib/include/uapi/asm/cpu.h | 6 + .../src/lib/include/uapi/asm/fpu.h | 4 + .../src/lib/include/uapi/asm/infect-types.h | 67 ++++++ .../src/lib/include/uapi/asm/sigframe.h | 86 ++++++++ compel/arch/loongarch64/src/lib/infect.c | 204 ++++++++++++++++++ compel/src/main.c | 3 + scripts/nmk/scripts/include.mk | 3 +- 23 files changed, 881 insertions(+), 3 deletions(-) create mode 100644 compel/arch/loongarch64/plugins/include/asm/prologue.h create mode 100644 compel/arch/loongarch64/plugins/include/asm/syscall-types.h create mode 100644 compel/arch/loongarch64/plugins/include/features.h create mode 100644 
compel/arch/loongarch64/plugins/std/parasite-head.S create mode 100644 compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls create mode 100644 compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S create mode 100644 compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl create mode 100644 compel/arch/loongarch64/scripts/compel-pack.lds.S create mode 100644 compel/arch/loongarch64/src/lib/cpu.c create mode 100644 compel/arch/loongarch64/src/lib/handle-elf-host.c create mode 100644 compel/arch/loongarch64/src/lib/handle-elf.c create mode 100644 compel/arch/loongarch64/src/lib/include/handle-elf.h create mode 100644 compel/arch/loongarch64/src/lib/include/syscall.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h create mode 100644 compel/arch/loongarch64/src/lib/infect.c diff --git a/Makefile b/Makefile index a5c6c5bccf..9a297d2d83 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endif # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif @@ -80,6 +80,10 @@ ifeq ($(ARCH),mips) DEFINES := -DCONFIG_MIPS endif +ifeq ($(ARCH),loongarch64) + DEFINES := -DCONFIG_LOONGARCH64 +endif + # # CFLAGS_PIE: # @@ -122,6 +126,10 @@ ifeq ($(ARCH),mips) WARNINGS := -rdynamic endif +ifeq ($(ARCH),loongarch64) +WARNINGS := -Wno-implicit-function-declaration +endif + ifneq ($(GCOV),) LDFLAGS += -lgcov CFLAGS += $(CFLAGS-GCOV) diff --git a/compel/Makefile b/compel/Makefile index b79aee6871..78ec4826af 100644 --- a/compel/Makefile +++ 
b/compel/Makefile @@ -33,7 +33,7 @@ lib-y += arch/$(ARCH)/src/lib/thread_area.o endif # handle_elf() has no support of ELF relocations on ARM (yet?) -ifneq ($(filter arm aarch64,$(ARCH)),) +ifneq ($(filter arm aarch64 loongarch64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/loongarch64/plugins/include/asm/prologue.h b/compel/arch/loongarch64/plugins/include/asm/prologue.h new file mode 100644 index 0000000000..c19ce54d7a --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME It is rather should be taken from sigframe header. 
+ */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/asm/syscall-types.h b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000000..b883bd8bed --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h @@ -0,0 +1,30 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#include +/* Types for sigaction, sigprocmask syscalls */ +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +/* refer to arch/loongarch/include/uapi/asm/signal.h */ +#define _KNSIG 64 +#define _NSIG_BPW BITS_PER_LONG +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + uint64_t sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#define SA_RESTORER 0x04000000 + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/features.h b/compel/arch/loongarch64/plugins/include/features.h new file mode 100644 index 0000000000..b4a3cded2b --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/compel/arch/loongarch64/plugins/std/parasite-head.S b/compel/arch/loongarch64/plugins/std/parasite-head.S new file mode 100644 index 0000000000..3a960490eb --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/parasite-head.S @@ -0,0 +1,9 @@ + +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + bl parasite_service; + break 0; +END(__export_parasite_head_start) + diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000000..0d08f34e1d --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,117 @@ +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o +sys-proto-types := $(obj)/include/uapi/std/syscall-types.h +sys-proto-generic := $(obj)/include/uapi/std/syscall.h +sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h +sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h +sys-proto = $(obj)/include/uapi/std/syscall-$(1).h +sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl +sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S +sys-asm-common-name = std/syscalls/syscall-common-loongarch-$(1).S +sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c + +sys-bits := 64 + +AV := $$$$ + +define gen-rule-sys-codes +$(sys-codes): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ + sub("^__NR", "SYS", SYSN); \ + print "\n#ifndef ", $(AV)1; \ + print "#define", $(AV)1, $(AV)2; \ + print "#endif"; \ + print "\n#ifndef ", SYSN; \ + print "#define ", SYSN, $(AV)1; \ + print "#endif";}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-proto +$(sys-proto): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include ' >> $$@ +ifeq ($(1),32) + $(Q) echo '#include "asm/syscall32.h"' >> $$@ +endif + 
$(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ + substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-asm +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ + $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ +endef + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(sys-codes-generic): $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ + sub("^__NR", "__NR32", NR32); \ + print "\n#ifndef ", NR32; \ + print "#define ", NR32, $$2; \ + print "#endif";}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ +mrproper-y += $(sys-codes-generic) + +$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ +mrproper-y += $(sys-proto-generic) + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat 
$$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) +$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) +$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) +$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(call sys-codes,$(sys-bits)) +std-headers-deps += $(call sys-proto,$(sys-bits)) +std-headers-deps += $(call sys-asm,$(sys-bits)) +std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) +std-headers-deps += $(sys-codes-generic) +std-headers-deps += $(sys-proto-generic) +std-headers-deps += $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S new file mode 100644 index 0000000000..fff8944669 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S @@ -0,0 +1,44 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ +ENTRY(name); \ + addi.d $a7, $zero, opcode; \ + syscall 0; \ + jirl $r0, $r1, 0; \ +END(name) + +#ifndef AT_FDCWD +#define AT_FDCWD -100 +#endif + +#ifndef AT_REMOVEDIR +#define AT_REMOVEDIR 0x200 +#endif + +ENTRY(sys_open) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_openat +END(sys_open) + +ENTRY(sys_mkdir) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_mkdirat +END(sys_mkdir) + +ENTRY(sys_rmdir) + addi.d $a2, $zero, AT_REMOVEDIR + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_unlinkat +END(sys_rmdir) + +ENTRY(__cr_restore_rt) + addi.d $a7, $zero, __NR_rt_sigreturn + syscall 0 +END(__cr_restore_rt) diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl new file mode 100644 index 0000000000..b37a22674e --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -0,0 +1,121 @@ +# +# System calls table, please make sure the table consist only the syscalls +# really used somewhere in project. +# from kernel/linux-3.10.84/arch/mips/include/uapi/asm/unistd.h Linux 64-bit syscalls are in the range from 5000 to 5999. +# +# __NR_name code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 32 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 39 sys_umount2 (char *name, int flags) +__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 57 sys_close (int fd) +__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 63 sys_read (int fd, void *buf, unsigned long count) +__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 69 sys_preadv_raw (int 
fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 92 sys_personality (unsigned int personality) +__NR_exit 93 sys_exit (unsigned long error_code) +__NR_exit_group 94 sys_exit_group (int error_code) +__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) +__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 110 
sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 128 sys_restart_syscall (void) +__NR_kill 129 sys_kill (long pid, int sig) +__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 139 sys_rt_sigreturn (void) +__NR_setpriority 140 sys_setpriority (int which, int who, int nice) +__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 155 sys_getpgid (pid_t pid) +__NR_setfsuid 151 sys_setfsuid (int fsuid) +__NR_setfsgid 152 sys_setfsgid (int fsgid) +__NR_getsid 156 sys_getsid (void) +__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 166 sys_umask (int mask) +__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 172 sys_getpid (void) +__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 178 sys_gettid 
(void) +__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 198 sys_socket (int domain, int type, int protocol) +__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_shutdown 210 sys_shutdown (int sockfd, int how) +__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 214 sys_brk (void *addr) +__NR_munmap 215 sys_munmap (void *addr, unsigned long len) +__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) 
+__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 268 sys_setns (int fd, int nstype) +__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 282 sys_userfaultfd (int flags) +__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) +#__NR_rmdir ! sys_rmdir (const char *name) +#__NR_unlink ! sys_unlink (char *pathname) +#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) +#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) +#__NR_mkdir ! sys_mkdir (const char *name, int mode) +#__NR_open ! 
sys_open (const char *filename, unsigned long flags, unsigned long mode) diff --git a/compel/arch/loongarch64/scripts/compel-pack.lds.S b/compel/arch/loongarch64/scripts/compel-pack.lds.S new file mode 100644 index 0000000000..cfb7a2fb35 --- /dev/null +++ b/compel/arch/loongarch64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(loongarch) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} diff --git a/compel/arch/loongarch64/src/lib/cpu.c b/compel/arch/loongarch64/src/lib/cpu.c new file mode 100644 index 0000000000..172b90e275 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/cpu.c @@ -0,0 +1,41 @@ +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; +static bool rt_info_done = false; + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *c) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } + + return compel_test_cpu_cap(&rt_info, feature); +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf-host.c b/compel/arch/loongarch64/src/lib/handle-elf-host.c new file mode 100644 index 0000000000..a605a5a452 --- /dev/null +++ 
b/compel/arch/loongarch64/src/lib/handle-elf-host.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf.c b/compel/arch/loongarch64/src/lib/handle-elf.c new file mode 100644 index 0000000000..a605a5a452 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/handle-elf.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/include/handle-elf.h b/compel/arch/loongarch64/src/lib/include/handle-elf.h new file mode 100644 index 0000000000..b0a66ef879 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/handle-elf.h @@ -0,0 +1,8 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define arch_is_machine_supported(e_machine) (e_machine == EM_LOONGARCH) + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/syscall.h b/compel/arch/loongarch64/src/lib/include/syscall.h new 
file mode 100644 index 0000000000..ac3e2799ac --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ + +#ifndef SIGSTKFLT +#define SIGSTKFLT 16 +#endif + +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 0000000000..21eb1309f2 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000000..e568df789c --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_CPU_H__ +#define __CR_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; +#endif /* __CR_ASM_CPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000000..7f476d541a --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000000..0b047a5b08 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,67 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * From the Linux kernel header arch/loongarch/include/uapi/asm/ptrace.h 
+ * + * A thread LoongArch CPU context + * + * struct user_fp_state { + * uint64_t fpr[32]; + * uint64_t fcc; + * uint32_t fcsr; + * }; + * + * struct user_pt_regs { + * unsigned long regs[32]; + * unsigned long csr_era; + * unsigned long csr_badv; + * unsigned long reserved[11]; + * }; + */ + +struct user_gp_regs { + uint64_t regs[32]; + uint64_t orig_a0; + uint64_t pc; + uint64_t csr_badv; + uint64_t reserved[10]; +} __attribute__((aligned(8))); + +struct user_fp_regs { + uint64_t regs[32]; + uint64_t fcc; + uint32_t fcsr; +}; + +typedef struct user_gp_regs user_regs_struct_t; +typedef struct user_fp_regs user_fpregs_struct_t; + +#define user_regs_native(regs) true + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(r) ((uint64_t)(r).regs[4]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define REG_SP(r) ((uint64_t)(r).regs[3]) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[11]) +#define SET_REG_IP(r, val) ((r).pc = (val)) + +#define GPR_NUM 32 +#define FPR_NUM 32 + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000000..fcb545a1d2 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,86 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include +#include + +#include +#include + +#include + +#define rt_sigcontext sigcontext +/* sigcontext defined in usr/include/uapi/asm/sigcontext.h*/ +#include +typedef __u32 u32; + +typedef struct sigcontext_t { + __u64 pc; + __u64 regs[32]; + __u32 flags; + __u64 extcontext[0] __attribute__((__aligned__(16))); +} sigcontext_t; + +typedef struct context_info_t { 
+ __u32 magic; + __u32 size; + __u64 padding; +} context_info_t; + +#define FPU_CTX_MAGIC 0x46505501 +#define FPU_CTX_ALIGN 8 +typedef struct fpu_context_t { + __u64 regs[32]; + __u64 fcc; + __u64 fcsr; +} fpu_context_t; + +typedef struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + sigset_t uc_sigmask; + __u8 __unused[1024 / 8 - sizeof(sigset_t)]; + sigcontext_t uc_mcontext; +} ucontext; + +/* Copy from the kernel source arch/loongarch/kernel/signal.c */ +struct rt_sigframe { + rt_siginfo_t rs_info; + ucontext rs_uc; +}; + +#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe->rs_uc)) +#define RT_SIGFRAME_SIGMASK(rt_sigframe) ((k_rtsigset_t *)&RT_SIGFRAME_UC(rt_sigframe)->uc_sigmask) +#define RT_SIGFRAME_SIGCTX(rt_sigframe) (&(RT_SIGFRAME_UC(rt_sigframe)->uc_mcontext)) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(RT_SIGFRAME_SIGCTX(rt_sigframe)->pc)) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) + +#define RT_SIGFRAME_FPU(rt_sigframe) \ + ({ \ + context_info_t *ctx = (context_info_t *)RT_SIGFRAME_SIGCTX(rt_sigframe)->extcontext; \ + ctx->magic = FPU_CTX_MAGIC; \ + ctx->size = sizeof(context_info_t) + sizeof(fpu_context_t); \ + (fpu_context_t *)((char *)ctx + sizeof(context_info_t)); \ + }) + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "addi.d $sp, %0, 0 \n" \ + "addi.d $a7, $zero, "__stringify(__NR_rt_sigreturn)" \n" \ + "syscall 0" \ + : \ + :"r"(new_sp) \ + : "$a7", "memory") +/* clang-format on */ + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); + +#define rt_sigframe_erase_sigset(sigframe) memset(RT_SIGFRAME_SIGMASK(sigframe), 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(RT_SIGFRAME_SIGMASK(sigframe), from, sizeof(k_rtsigset_t)) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/loongarch64/src/lib/infect.c 
b/compel/arch/loongarch64/src/lib/infect.c new file mode 100644 index 0000000000..8e3c19aff2 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/infect.c @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include + +#include +#include +#include "errno.h" +#include +#include +#include "common/err.h" +#include "common/page.h" +#include "asm/infect-types.h" +#include "ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" +#include "common/bug.h" + +/* + * Injected syscall instruction + * loongarch64 is Little Endian + */ +const char code_syscall[] = { + 0x00, 0x00, 0x2b, 0x00, /* syscall */ + 0x00, 0x00, 0x2a, 0x00 /* break */ +}; + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigcontext_t *sc; + fpu_context_t *fpu; + + sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, regs->regs, sizeof(regs->regs)); + sc->pc = regs->pc; + + fpu = RT_SIGFRAME_FPU(sigframe); + memcpy(fpu->regs, fpregs->regs, sizeof(fpregs->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; + struct iovec iov; + int ret; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { + pr_perror("Failed to obtain CPU registers for %d", pid); + goto err; + } + + /* + * Refer to Linux kernel arch/loongarch/kernel/signal.c + */ + if (regs->regs[0]) { + switch (regs->regs[4]) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->regs[4] = regs->orig_a0; + regs->pc -= 4; + break; + case -ERESTART_RESTARTBLOCK: + regs->regs[4] = regs->orig_a0; + regs->regs[11] = __NR_restart_syscall; + regs->pc -= 4; + break; + } + regs->regs[0] = 0; /* Don't deal with this again. */ + } + + iov.iov_base = fpregs; + iov.iov_len = sizeof(user_fpregs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + goto err; + } + + ret = save(arg, regs, fpregs); +err: + return ret; /* propagate a ptrace() or save() failure instead of always reporting success */ +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +/* + * Registers $4 ~ $11 represents arguments a0 ~ a7, especially a7 is + * used as syscall number. 
+ */ +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + int err; + user_regs_struct_t regs = ctl->orig.regs; + + regs.regs[11] = (unsigned long)nr; + regs.regs[4] = arg1; + regs.regs[5] = arg2; + regs.regs[6] = arg3; + regs.regs[7] = arg4; + regs.regs[8] = arg5; + regs.regs[9] = arg6; + err = compel_execute_syscall(ctl, &regs, code_syscall); /* pass the prepared local register copy by address */ + + *ret = regs.regs[4]; + + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); + + if (err < 0 || IS_ERR_VALUE(map)) { + pr_err("remote mmap() failed: %s\n", strerror(-map)); + return NULL; + } + + return (void *)map; +} + +/* + * regs must be inited when calling this function from original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->regs[4] = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); + return err ? 
err : ret; +} + +/* + * TODO: add feature + */ +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +/* + * Refer to Linux kernel arch/loongarch/include/asm/processor.h + */ +#define TASK_SIZE32 (1UL) << 31 +#define TASK_SIZE64_MIN (1UL) << 40 +#define TASK_SIZE64_MAX (1UL) << 48 + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + for (task_size = TASK_SIZE64_MIN; task_size < TASK_SIZE64_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} diff --git a/compel/src/main.c b/compel/src/main.c index ef05a46d01..bc16c0ab41 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -57,6 +57,9 @@ static const flags_t flags = { #elif defined CONFIG_MIPS .arch = "mips", .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_LOONGARCH64 + .arch = "loongarch64", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif diff --git a/scripts/nmk/scripts/include.mk b/scripts/nmk/scripts/include.mk index c1c1e94af4..55c5be307f 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -20,7 +20,8 @@ ARCH ?= $(shell echo $(SUBARCH) | sed \ -e s/ppc64.*/ppc64/ \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ - -e s/aarch64.*/aarch64/) + -e s/aarch64.*/aarch64/ \ + -e s/loongarch64.*/loongarch64/) export SUBARCH ARCH From 521383d1b2e831aaa0ebb012552c3b73415fb42c Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 15:09:22 +0800 Subject: [PATCH 089/321] images: add loongarch64 core image Signed-off-by: znley --- images/Makefile | 1 + images/core-loongarch64.proto | 23 +++++++++++++++++++++++ images/core.proto | 3 +++ 3 files changed, 27 insertions(+) create mode 100755 images/core-loongarch64.proto diff --git a/images/Makefile b/images/Makefile index 004e22ec3f..ca85b1a213 100644 --- a/images/Makefile +++ b/images/Makefile @@ -2,6 +2,7 @@ proto-obj-y += stats.o proto-obj-y += core.o 
proto-obj-y += core-x86.o proto-obj-y += core-mips.o +proto-obj-y += core-loongarch64.o proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o diff --git a/images/core-loongarch64.proto b/images/core-loongarch64.proto new file mode 100755 index 0000000000..8258f006ea --- /dev/null +++ b/images/core-loongarch64.proto @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; + +message user_loongarch64_gpregs_entry { + repeated uint64 regs = 1; + required uint64 pc = 2; +} + +message user_loongarch64_fpregs_entry { + repeated uint64 regs = 1; + required uint64 fcc = 2; + required uint32 fcsr = 3; +} + +message thread_info_loongarch64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_loongarch64_gpregs_entry gpregs = 3[(criu).hex = true]; + required user_loongarch64_fpregs_entry fpregs = 4[(criu).hex = true]; +} diff --git a/images/core.proto b/images/core.proto index eddd1dc555..1882fe8e42 100644 --- a/images/core.proto +++ b/images/core.proto @@ -8,6 +8,7 @@ import "core-aarch64.proto"; import "core-ppc64.proto"; import "core-s390.proto"; import "core-mips.proto"; +import "core-loongarch64.proto"; import "rlimit.proto"; import "timer.proto"; @@ -122,6 +123,7 @@ message core_entry { PPC64 = 4; S390 = 5; MIPS = 6; + LOONGARCH64 = 7; } required march mtype = 1; @@ -131,6 +133,7 @@ message core_entry { optional thread_info_ppc64 ti_ppc64 = 9; optional thread_info_s390 ti_s390 = 10; optional thread_info_mips ti_mips = 11; + optional thread_info_loongarch64 ti_loongarch64 = 12; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; From 95fbd7e45788a55339cf90f96a4cf9b1b1046ca7 Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 15:15:30 +0800 Subject: [PATCH 090/321] criu: add loongarch64 support to parasite and restorer Signed-off-by: znley --- criu/arch/loongarch64/Makefile | 14 +++ criu/arch/loongarch64/cpu.c | 31 +++++ 
criu/arch/loongarch64/crtools.c | 115 ++++++++++++++++++ criu/arch/loongarch64/include/asm/dump.h | 15 +++ criu/arch/loongarch64/include/asm/int.h | 6 + criu/arch/loongarch64/include/asm/kerndat.h | 7 ++ .../include/asm/parasite-syscall.h | 6 + criu/arch/loongarch64/include/asm/parasite.h | 11 ++ criu/arch/loongarch64/include/asm/restore.h | 33 +++++ criu/arch/loongarch64/include/asm/restorer.h | 97 +++++++++++++++ .../loongarch64/include/asm/thread_pointer.h | 27 ++++ criu/arch/loongarch64/include/asm/types.h | 39 ++++++ criu/arch/loongarch64/include/asm/vdso.h | 27 ++++ criu/arch/loongarch64/restorer.c | 14 +++ criu/arch/loongarch64/sigframe.c | 12 ++ criu/arch/loongarch64/vdso-pie.c | 48 ++++++++ 16 files changed, 502 insertions(+) create mode 100644 criu/arch/loongarch64/Makefile create mode 100644 criu/arch/loongarch64/cpu.c create mode 100644 criu/arch/loongarch64/crtools.c create mode 100644 criu/arch/loongarch64/include/asm/dump.h create mode 100644 criu/arch/loongarch64/include/asm/int.h create mode 100644 criu/arch/loongarch64/include/asm/kerndat.h create mode 100644 criu/arch/loongarch64/include/asm/parasite-syscall.h create mode 100644 criu/arch/loongarch64/include/asm/parasite.h create mode 100644 criu/arch/loongarch64/include/asm/restore.h create mode 100644 criu/arch/loongarch64/include/asm/restorer.h create mode 100644 criu/arch/loongarch64/include/asm/thread_pointer.h create mode 100644 criu/arch/loongarch64/include/asm/types.h create mode 100644 criu/arch/loongarch64/include/asm/vdso.h create mode 100644 criu/arch/loongarch64/restorer.c create mode 100644 criu/arch/loongarch64/sigframe.c create mode 100644 criu/arch/loongarch64/vdso-pie.c diff --git a/criu/arch/loongarch64/Makefile b/criu/arch/loongarch64/Makefile new file mode 100644 index 0000000000..4bd99eb7eb --- /dev/null +++ b/criu/arch/loongarch64/Makefile @@ -0,0 +1,14 @@ +builtin-name := crtools.built-in.o + +ccflags-y += -iquote $(obj)/include +ccflags-y += -iquote criu/include -iquote 
include +ccflags-y += $(COMPEL_UAPI_INCLUDES) + +asflags-y += -Wstrict-prototypes +asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer +asflags-y += -iquote $(obj)/include +ldflags-y += -r -z noexecstack + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o diff --git a/criu/arch/loongarch64/cpu.c b/criu/arch/loongarch64/cpu.c new file mode 100644 index 0000000000..5559c4288f --- /dev/null +++ b/criu/arch/loongarch64/cpu.c @@ -0,0 +1,31 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpuinfo_dump(void) +{ + if (cpu_init()) + return -1; + if (cpu_dump_cpuinfo()) + return -1; + return 0; +} + +int cpuinfo_check(void) +{ + return 0; +} diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c new file mode 100644 index 0000000000..eeb0731ca6 --- /dev/null +++ b/criu/arch/loongarch64/crtools.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "log.h" +#include "asm/restorer.h" +#include "asm/parasite-syscall.h" +#include +#include "asm/dump.h" +#include "cr_options.h" +#include "common/compiler.h" +#include "restorer.h" +#include "parasite-syscall.h" +#include "util.h" +#include "cpu.h" +#include +#include "kerndat.h" + +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" + +#define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + int i; + CoreEntry *core = x; + UserLoongarch64GpregsEntry *gprs = core->ti_loongarch64->gpregs; + UserLoongarch64FpregsEntry *fprs = core->ti_loongarch64->fpregs; + for (i = 0; i < GPR_NUM; i++) + assign_reg(gprs, regs, regs[i]); + assign_reg(gprs, regs, pc); + + for (i = 0; i < FPR_NUM; i++) + assign_reg(fprs, fpregs, regs[i]); /* store into the image entry; the source struct must not be the destination */ + 
assign_reg(fprs, fpregs, fcc); + assign_reg(fprs, fpregs, fcsr); + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoLoongarch64 *ti_loongarch64; + UserLoongarch64GpregsEntry *gpregs; + UserLoongarch64FpregsEntry *fpregs; + + ti_loongarch64 = xmalloc(sizeof(*ti_loongarch64)); + thread_info_loongarch64__init(ti_loongarch64); + core->ti_loongarch64 = ti_loongarch64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_loongarch64_gpregs_entry__init(gpregs); + gpregs->n_regs = GPR_NUM; + gpregs->regs = xmalloc(GPR_NUM * sizeof(uint64_t)); + if (!gpregs->regs) + goto err; + ti_loongarch64->gpregs = gpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + goto err; + user_loongarch64_fpregs_entry__init(fpregs); + fpregs->n_regs = FPR_NUM; + fpregs->regs = xmalloc(FPR_NUM * sizeof(uint64_t)); + if (!fpregs->regs) + goto err; + ti_loongarch64->fpregs = fpregs; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (CORE_THREAD_ARCH_INFO(core)) { + if (CORE_THREAD_ARCH_INFO(core)->fpregs) { + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs); + } + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); + xfree(CORE_THREAD_ARCH_INFO(core)); + CORE_THREAD_ARCH_INFO(core) = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + fpu_context_t *fpu = RT_SIGFRAME_FPU(sigframe); + UserLoongarch64FpregsEntry *fpregs = core->ti_loongarch64->fpregs; + + memcpy(fpu->regs, fpregs->regs, sizeof(fpu->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int restore_gpregs(struct rt_sigframe *sigframe, UserRegsEntry *r) +{ + sigcontext_t *sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, r->regs, sizeof(sc->regs)); + sc->pc = r->pc; + return 0; +} diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h new file mode 
100644 index 0000000000..04347155c3 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/dump.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_loongarch64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/int.h b/criu/arch/loongarch64/include/asm/int.h new file mode 100644 index 0000000000..642804e9b4 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/kerndat.h b/criu/arch/loongarch64/include/asm/kerndat.h new file mode 100644 index 0000000000..bb70cf6cf5 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/parasite-syscall.h b/criu/arch/loongarch64/include/asm/parasite-syscall.h new file mode 100644 index 0000000000..6008c37923 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/loongarch64/include/asm/parasite.h b/criu/arch/loongarch64/include/asm/parasite.h new file mode 100644 index 0000000000..b64cb3185c --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite.h @@ -0,0 +1,11 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm 
volatile("or %0, $zero, $tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/loongarch64/include/asm/restore.h b/criu/arch/loongarch64/include/asm/restore.h new file mode 100644 index 0000000000..d956231c81 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restore.h @@ -0,0 +1,33 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args) \ +({ \ + uint64_t save_sp; \ + asm volatile("or %0, $zero, $sp" : "=r"(save_sp) : :"memory"); \ + asm volatile( \ + "or $a0, $zero, %2 \n" \ + "or $sp, $zero, %0 \n" \ + "jirl $ra, %1, 0 \n" \ + : \ + : "r"(new_sp & ~15), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "$a0", "memory"); \ + asm volatile("or $sp, $zero, %0" : : "r"(save_sp) : "memory"); \ +}) + +/* clang-format on */ + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_loongarch64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/loongarch64/include/asm/restorer.h b/criu/arch/loongarch64/include/asm/restorer.h new file mode 100644 index 0000000000..7a0d35c5b5 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restorer.h @@ -0,0 +1,97 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include "asm/types.h" +#include +#include "images/core.pb-c.h" +#include +#include + +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld.d $a1, %2 \n" \ + "addi.d $a1, $a1, -16 \n" \ + "st.d %5, $a1, 0 \n" \ + "st.d %6, $a1, 8 \n" \ + "or $a0, $zero, %1 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone_end 
\n" \ + \ + "thread_run: \n" \ + "ld.d $a1, $sp, 0 \n" \ + "ld.d $a0, $sp, 8 \n" \ + "jirl $ra, $a1, 0 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "ZB"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(&clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + "or $a0, $zero, %1 \n" \ + "or $a1, $zero, %2 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone3)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, clone3_thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + "or $a0, $zero, $a3 \n" \ + "jirl $ra, $a2, 0 \n" \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") +/* clang-format on */ + +static inline void restore_tls(tls_t *ptls) +{ + asm volatile("or $tp, $zero, %0" : : "r"(*ptls)); +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +int restore_gpregs(struct rt_sigframe *f, UserLoongarch64GpregsEntry *r); +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r); + +#define arch_map_vdso(map, compat) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/thread_pointer.h b/criu/arch/loongarch64/include/asm/thread_pointer.h new file mode 100644 index 0000000000..f7e07066a5 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. 
+ Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/loongarch64/include/asm/types.h b/criu/arch/loongarch64/include/asm/types.h new file mode 100644 index 0000000000..72bca2022b --- /dev/null +++ b/criu/arch/loongarch64/include/asm/types.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" +#include "images/core.pb-c.h" + +#include + +#define core_is_compat(core) false + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__LOONGARCH64 + +#define CORE_THREAD_ARCH_INFO(core) core->ti_loongarch64 + +#define TI_SP(core) ((core)->ti_loongarch64->gpregs->regs[4]) + +#define TI_IP(core) ((core)->ti_loongarch64->gpregs->pc) + +typedef UserLoongarch64GpregsEntry UserRegsEntry; + +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} + +#define AT_VECTOR_SIZE 44 +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git 
a/criu/arch/loongarch64/include/asm/vdso.h b/criu/arch/loongarch64/include/asm/vdso.h new file mode 100644 index 0000000000..64631dee09 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/vdso.h @@ -0,0 +1,27 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* This definition is used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + */ + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 5 +#define VDSO_SYMBOL_GTOD 3 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__vdso_getcpu"; \ + const char *aarch_vdso_symbol2 = "__vdso_clock_getres"; \ + const char *aarch_vdso_symbol3 = "__vdso_clock_gettime"; \ + const char *aarch_vdso_symbol4 = "__vdso_gettimeofday"; \ + const char *aarch_vdso_symbol5 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5 +#endif diff --git a/criu/arch/loongarch64/restorer.c b/criu/arch/loongarch64/restorer.c new file mode 100644 index 0000000000..730318ac14 --- /dev/null +++ b/criu/arch/loongarch64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" +#include + +#include +#include "log.h" +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r) +{ + return 0; +} diff --git a/criu/arch/loongarch64/sigframe.c b/criu/arch/loongarch64/sigframe.c new file mode 100644 index 0000000000..18983ff138 --- /dev/null +++ b/criu/arch/loongarch64/sigframe.c @@ -0,0 +1,12 @@ +#include +#include + +#include "asm/sigframe.h" +#include "asm/types.h" + +#include "log.h" +#include +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/loongarch64/vdso-pie.c b/criu/arch/loongarch64/vdso-pie.c new file mode 100644 index 
0000000000..7a75d2741d --- /dev/null +++ b/criu/arch/loongarch64/vdso-pie.c @@ -0,0 +1,48 @@ +#include +#include "asm/types.h" + +#include +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " +static void insert_trampoline(uintptr_t from, uintptr_t to) +{ + struct { + uint32_t pcaddi; + uint32_t ldptr; + uint32_t jirl; + uint32_t guards; + uint64_t imm64; + } __packed jmp = { + .pcaddi = 0x18000095, /* pcaddi $x, 4 */ + .ldptr = 0x260002b5, /* ldptr.d $x, $x, 0 */ + .jirl = 0x4c0002a0, /* jirl $zero, $x, 0 */ + .guards = 0x002a0000, /* break 0 */ + .imm64 = to, + }; + memcpy((void *)from, &jmp, sizeof(jmp)); +} + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, + struct vdso_symtable *sfrom, bool compat_vdso) +{ + unsigned int i; + unsigned long from, to; + for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { + if (vdso_symbol_empty(&sfrom->symbols[i])) + continue; + pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, + sto->symbols[i].offset, i); + + from = base_from + sfrom->symbols[i].offset; + to = base_to + sto->symbols[i].offset; + insert_trampoline(from, to); + } + return 0; +} From c94128250ffde2f75c3f600e6d52004636d1dc63 Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 15:26:35 +0800 Subject: [PATCH 091/321] zdtm: add loongarch64 support Signed-off-by: znley --- .../lib/arch/loongarch64/include/asm/atomic.h | 49 +++++++++++++++++++ test/zdtm/lib/test.c | 2 +- 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 test/zdtm/lib/arch/loongarch64/include/asm/atomic.h diff --git a/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h new file mode 100644 index 0000000000..1803aaeb44 --- /dev/null +++ b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h @@ -0,0 +1,49 @@ +#ifndef __CR_ATOMIC_H__ 
+#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + +#define atomic_get(v) (*(volatile int *)v) +#define atomic_set(v, i) (*(v) = (i)) + +static inline int __atomic_add(int i, atomic_t *v) +{ + int result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(*v), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)i, v) +#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) +#define atomic_inc(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub_return(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(*ptr) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index 6291ea4a7b..a5ba38b2dd 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -406,7 +406,7 @@ pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid { #ifdef __x86_64__ return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); -#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__) +#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64) return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); #elif __s390x__ return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); From f70c782e5c39097b8950b2d67b403ae302ff9d6b Mon Sep 17 00:00:00 2001 From: znley Date: Tue, 11 Jul 2023 15:20:00 +0800 Subject: [PATCH 092/321] ci: add workflow for 
loongarch64 Signed-off-by: znley --- .github/workflows/loongarch64-qemu-test.yml | 15 +++++ scripts/ci/Makefile | 5 ++ scripts/ci/loongarch64-qemu-test.sh | 69 +++++++++++++++++++++ 3 files changed, 89 insertions(+) create mode 100644 .github/workflows/loongarch64-qemu-test.yml create mode 100755 scripts/ci/loongarch64-qemu-test.sh diff --git a/.github/workflows/loongarch64-qemu-test.yml b/.github/workflows/loongarch64-qemu-test.yml new file mode 100644 index 0000000000..ba22fa25ff --- /dev/null +++ b/.github/workflows/loongarch64-qemu-test.yml @@ -0,0 +1,15 @@ +name: LoongArch64 Qemu Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: loongarch64-qemu-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v2 + - run: sudo make -C scripts/ci loongarch64-qemu-test diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 5c45791034..ce844a17ce 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -110,5 +110,10 @@ check-commit: .PHONY: check-commit +loongarch64-qemu-test: + ./loongarch64-qemu-test.sh + +.PHONY: loongarch64-qemu-test + %: $(MAKE) -C ../build $@$(target-suffix) diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh new file mode 100755 index 0000000000..52e587619c --- /dev/null +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -x + +./apt-install \ + apt-transport-https \ + ca-certificates \ + curl \ + software-properties-common \ + sshpass \ + openssh-client + +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - + +add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable test" + +./apt-install docker-ce + +# shellcheck source=/dev/null +. 
/etc/lsb-release + +# docker checkpoint and restore is an experimental feature +echo '{ "experimental": true }' > /etc/docker/daemon.json +service docker restart + +docker info + +# run a loongarch64 vm + +PORT='2222' +USER='root' +PASSWORD='loongarch64' +NAME='vm' + +docker run \ + -d \ + --net host \ + --name $NAME \ + merore/archlinux-loongarch64 + +run() { + if [ -z "$1" ]; then + echo "Command cannot be empty." + exit 1 + fi + sshpass -p $PASSWORD ssh -o StrictHostKeyChecking=no -p $PORT $USER@127.0.0.1 "$1" +} + +# wait vm to start +while (! run "uname -a") +do + echo "Wait vm to start..." + sleep 1 +done +echo "The loongarch64 vm is started!" + +# Tar criu and send to vm +tar -cf criu.tar ../../../criu +sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -P $PORT criu.tar $USER@127.0.0.1:/root + +# build and test +run 'cd /root; tar -xf criu.tar' +run 'cd /root/criu; make -j4' +run "cd /root/criu; ./test/zdtm.py run -t zdtm/static/maps02 -t zdtm/static/maps05 -t zdtm/static/maps06 -t zdtm/static/maps10 -t zdtm/static/maps_file_prot -t zdtm/static/memfd00 -t zdtm/transition/fork -t zdtm/transition/fork2 -t zdtm/transition/shmem -f h" From 9f6948475a7009e99bc16688de30f9287fd7c37a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 20:46:33 +0200 Subject: [PATCH 093/321] util: Implement fchown() and fchmod() wrappers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add generic wrappers for fchown() and fchmod() that skip the calls if no changes are needed. This will allow to unify places where we can avoid errors when no-op requests are not permitted. 
Signed-off-by: Michał Mirosław --- criu/include/util.h | 4 +++ criu/util.c | 83 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/criu/include/util.h b/criu/include/util.h index 4b4dfda950..7e4a13a6a8 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -263,6 +263,10 @@ bool is_path_prefix(const char *path, const char *prefix); FILE *fopenat(int dirfd, char *path, char *cflags); void split(char *str, char token, char ***out, int *n); +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid); +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode); +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags); + int fd_has_data(int lfd); int make_yard(char *path); diff --git a/criu/util.c b/criu/util.c index aa73083bd7..bca7ad88a9 100644 --- a/criu/util.c +++ b/criu/util.c @@ -952,6 +952,89 @@ FILE *fopenat(int dirfd, char *path, char *cflags) return fdopen(tmp, cflags); } +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid) +{ + struct stat st; + + if (!fchown(fd, new_uid, new_gid)) + return 0; + if (errno != EPERM) + return -1; + + if (fstat(fd, &st) < 0) { + pr_perror("fstat() after fchown() for fd %d", fd); + goto out_eperm; + } + pr_debug("fstat(%d): uid %u gid %u\n", fd, st.st_uid, st.st_gid); + + if (new_uid != st.st_uid || new_gid != st.st_gid) + goto out_eperm; + + return 0; +out_eperm: + errno = EPERM; + return -1; +} + +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags) +{ + struct stat st; + int ret; + + if (fchownat(dirfd, path, new_uid, new_gid, flags) < 0 && errno != EPERM) { + int errno_cpy = errno; + pr_perror("Unable to change [%d]/%s ownership to (%d, %d)", + dirfd, path, new_uid, new_gid); + errno = errno_cpy; + return -1; + } + + if (fstatat(dirfd, path, &st, flags) < 0) { + int errno_cpy = errno; + pr_perror("Unable to stat [%d]/%s", dirfd, path); + errno = errno_cpy; + return -1; + } +
+ if (new_uid != st.st_uid || new_gid != st.st_gid) { + errno = EPERM; + pr_perror("Unable to change [%d]/%s ownership (%d, %d) to (%d, %d)", + dirfd, path, st.st_uid, st.st_gid, new_uid, new_gid); + errno = EPERM; + return -1; + } + + if (new_mode == st.st_mode) + return 0; + + if (S_ISLNK(st.st_mode)) { + /* + * We have no lchmod() function, and fchmod() will fail on + * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() + * function and flag AT_SYMLINK_NOFOLLOW described in + * man 2 fchmodat, but it is not currently implemented. %) + */ + return 0; + } + + if (!*path && flags & AT_EMPTY_PATH) + ret = fchmod(dirfd, new_mode); + else + ret = fchmodat(dirfd, path, new_mode, flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)); + if (ret < 0) { + int errno_cpy = errno; + pr_perror("Unable to set perms %o on [%d]/%s", new_mode, dirfd, path); + errno = errno_cpy; + } + + return ret; +} + +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode) +{ + return cr_fchpermat(fd, "", new_uid, new_gid, new_mode, AT_EMPTY_PATH); +} + void split(char *str, char token, char ***out, int *n) { int i; From 923e66bcdd136b0111e08c0d6c762e7e1b453348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 20:51:03 +0200 Subject: [PATCH 094/321] sk-unix: Avoid restore_file_perms() EPERM error for no-op changes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: This removes the difference in calling convention of restore_file_perms() returning -errno that was the only call that did this in the caller. 
From: Radosław Burny Signed-off-by: Michał Mirosław --- criu/sk-unix.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index fd38ee7b1c..70ca16be4a 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -1431,32 +1431,22 @@ static int post_open_standalone(struct file_desc *d, int fd) static int restore_file_perms(struct unix_sk_info *ui) { - if (ui->ue->file_perms) { - FilePermsEntry *perms = ui->ue->file_perms; - char fname[PATH_MAX]; + FilePermsEntry *perms = ui->ue->file_perms; + char fname[PATH_MAX]; - if (ui->ue->name.len >= sizeof(fname)) { - pr_err("The file name is too long\n"); - return -E2BIG; - } - - memcpy(fname, ui->name, ui->ue->name.len); - fname[ui->ue->name.len] = '\0'; - - if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file owner and group"); - return -errno_cpy; - } + if (!perms) + return 0; - if (fchmodat(AT_FDCWD, fname, perms->mode, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file mode bits"); - return -errno_cpy; - } + if (ui->ue->name.len >= sizeof(fname)) { + pr_err("The file name is too long\n"); + errno = -E2BIG; + return -1; } - return 0; + memcpy(fname, ui->name, ui->ue->name.len); + fname[ui->ue->name.len] = '\0'; + + return cr_fchpermat(AT_FDCWD, fname, perms->uid, perms->gid, perms->mode, 0); } static int keep_deleted(struct unix_sk_info *ui) From 5f214bc7d58e94798891043154046f17feb247d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 20:58:00 +0200 Subject: [PATCH 095/321] files-reg: Avoid EPERM in ghost_apply_metadata() for no-op changes.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/files-reg.c | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index 5120977161..50dcbc4386 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -407,46 +407,24 @@ static int mklnk_ghost(char *path, GhostFileEntry *gfe) static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) { struct timeval tv[2]; - int ret = -1; - if (S_ISLNK(gfe->mode)) { - if (lchown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (cr_fchpermat(AT_FDCWD, path, gfe->uid, gfe->gid, gfe->mode, AT_SYMLINK_NOFOLLOW) < 0) + return -1; - /* - * We have no lchmod() function, and fchmod() will fail on - * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() - * function and flag AT_SYMLINK_NOFOLLOW described in - * man 2 fchmodat, but it is not currently implemented.
%) - */ - } else { - if (chown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (!gfe->atim) + return 0; - if (chmod(path, gfe->mode)) { - pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); - goto err; - } - } + tv[0].tv_sec = gfe->atim->tv_sec; + tv[0].tv_usec = gfe->atim->tv_usec; + tv[1].tv_sec = gfe->mtim->tv_sec; + tv[1].tv_usec = gfe->mtim->tv_usec; - if (gfe->atim) { - tv[0].tv_sec = gfe->atim->tv_sec; - tv[0].tv_usec = gfe->atim->tv_usec; - tv[1].tv_sec = gfe->mtim->tv_sec; - tv[1].tv_usec = gfe->mtim->tv_usec; - if (lutimes(path, tv)) { - pr_perror("Can't set access and modification times on ghost %s", path); - goto err; - } + if (lutimes(path, tv)) { + pr_perror("Can't set access and modification times on ghost %s", path); + return -1; } - ret = 0; -err: - return ret; + return 0; } static int create_ghost_dentry(char *path, GhostFileEntry *gfe, struct cr_img *img) From 6e2312577bac405bd43c0153aac26451bf665f0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 21:01:29 +0200 Subject: [PATCH 096/321] cgroup: Replace restore_perms() with cr_fchperm(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/cgroup.c | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 267a5b6b47..67282f269e 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1337,34 +1337,6 @@ void fini_cgroup(void) cg_yard = NULL; } -static int restore_perms(int fd, const char *path, CgroupPerms *perms) -{ - struct stat sb; - - if (perms) { - if (fstat(fd, &sb) < 0) { - pr_perror("stat of property %s failed", path); - return -1; - } - - /* only chmod/chown if the perms are actually different: we aren't - * allowed to chmod some cgroup props (e.g.
the read only ones), so we - * don't want to try if the perms already match. - */ - if (sb.st_mode != (mode_t)perms->mode && fchmod(fd, perms->mode) < 0) { - pr_perror("chmod of %s failed", path); - return -1; - } - - if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) && fchown(fd, perms->uid, perms->gid)) { - pr_perror("chown of %s failed", path); - return -1; - } - } - - return 0; -} - static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) { char *current, *next; @@ -1462,7 +1434,7 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat return -1; } - if (restore_perms(fd, path, perms) < 0) + if (perms && cr_fchperm(fd, perms->uid, perms->gid, perms->mode) < 0) goto out; /* skip these two since restoring their values doesn't make sense */ @@ -1786,7 +1758,7 @@ static int restore_special_props(char *paux, size_t off, CgroupDirEntry *e) static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) { - int fd, ret; + int fd, ret = 0; fd = openat(cg, path, O_DIRECTORY); if (fd < 0) { @@ -1794,7 +1766,8 @@ static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) return -1; } - ret = restore_perms(fd, path, perms); + if (perms) + ret = cr_fchperm(fd, perms->uid, perms->gid, perms->mode); close(fd); return ret; } From 00d061b58dabf1aca512e1934d0667fd4732dee1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 21:02:43 +0200 Subject: [PATCH 097/321] memfd: Avoid EPERM for no-op chown(). 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/memfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/memfd.c b/criu/memfd.c index 6a43dece60..1b4278a7d3 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -279,7 +279,7 @@ static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) goto out; - if (fchown(fd, mie->uid, mie->gid)) { + if (cr_fchown(fd, mie->uid, mie->gid)) { pr_perror("Can't change uid %d gid %d of memfd:%s", (int)mie->uid, (int)mie->gid, mie->name); goto out; } From e90fbd72f73b486957890c9ddf1828ea1827611a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 21:03:02 +0200 Subject: [PATCH 098/321] tty: Avoid EPERM for no-op chown(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/tty.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/tty.c b/criu/tty.c index 199984ec08..9faf602f20 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -22,6 +22,7 @@ #include "rst-malloc.h" #include "log.h" #include "common/list.h" +#include "util.h" #include "util-pie.h" #include "proc_parse.h" #include "file-ids.h" @@ -867,7 +868,7 @@ static int restore_tty_params(int fd, struct tty_info *info) } if (info->tie->has_uid && info->tie->has_gid) { - if (fchown(fd, info->tie->uid, info->tie->gid)) { + if (cr_fchown(fd, info->tie->uid, info->tie->gid)) { pr_perror("Can't setup uid %d gid %d on %#x", (int)info->tie->uid, (int)info->tie->gid, info->tfe->id); return -1; From b9f360b15fc4883af2db611125488144f61f368e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Wed, 21 Jun 2023 22:42:09 +0200 Subject: [PATCH 099/321] restore: Avoid need for CAP_SETPCAP if not changing uids.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CRIU is run with the task's credentials on restore, don't set uids and gids. This avoids the need to modify the SECURE_NO_SETUID_FIXUP flag which requires CAP_SETPCAP. From: Andy Tucker Signed-off-by: Michał Mirosław --- criu/pie/restorer.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0d1360c52b..9d1facf8ad 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -191,10 +191,8 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ int b, i, ret; struct cap_header hdr; struct cap_data data[_LINUX_CAPABILITY_U32S_3]; - - /* - * We're still root here and thus can do it without failures. - */ + int ruid, euid, suid, fsuid; + int rgid, egid, sgid, fsgid; /* * Setup supplementary group IDs early. @@ -207,6 +205,18 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ } } + /* + * Compare xids with current values. If all match then we can skip + * setting them (which requires extra capabilities). + */ + fsuid = sys_setfsuid(-1); + fsgid = sys_setfsgid(-1); + if (sys_getresuid(&ruid, &euid, &suid) == 0 && sys_getresgid(&rgid, &egid, &sgid) == 0 && ruid == ce->uid && + euid == ce->euid && suid == ce->suid && rgid == ce->gid && egid == ce->egid && sgid == ce->sgid && + fsuid == ce->fsuid && fsgid == ce->fsgid) { + goto skip_xids; + } + /* * First -- set the SECURE_NO_SETUID_FIXUP bit not to * lose caps bits when changing xids. @@ -250,12 +260,13 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ return -1; } +skip_xids: /* * Third -- restore securebits. We don't need them in any * special state any longer.
*/ - if (!uid) { + if (sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0) != ce->secbits) { ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); if (ret) { pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); From 53dd6ba74c4b8fed95d9c2292aae191b12c3977a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Wed, 21 Jun 2023 22:42:44 +0200 Subject: [PATCH 100/321] restore: Skip setgroups() when already correct. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip calling setgroups() when the list of auxiliary groups already has the values we want. This allows restoring into an unprivileged user namespace where setgroups() is disabled. From: Ambrose Feinstein Signed-off-by: Michał Mirosław --- criu/pie/restorer.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9d1facf8ad..a0f3eb90b4 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -51,6 +51,11 @@ #include "shmem.h" #include "restorer.h" +/* + * sys_getgroups() buffer size. Not too much, to avoid stack overflow. + */ +#define MAX_GETGROUPS_CHECKED (512 / sizeof(unsigned int)) + #ifndef PR_SET_PDEATHSIG #define PR_SET_PDEATHSIG 1 #endif @@ -198,10 +203,19 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * Setup supplementary group IDs early. */ if (args->groups) { - ret = sys_setgroups(ce->n_groups, args->groups); - if (ret) { - pr_err("Can't setup supplementary group IDs: %d\n", ret); - return -1; + /* + * We may be in an unprivileged user namespace where setgroups + * is disabled. If the current list of groups is already what + * we want, skip the call to setgroups.
+ */ + unsigned int gids[MAX_GETGROUPS_CHECKED]; + int n = sys_getgroups(MAX_GETGROUPS_CHECKED, gids); + if (n != ce->n_groups || memcmp(gids, args->groups, n * sizeof(*gids))) { + ret = sys_setgroups(ce->n_groups, args->groups); + if (ret) { + pr_err("Can't setgroups([%zu gids]): %d\n", ce->n_groups, ret); + return -1; + } } } From ff67ad84e4d6bd3e6405a298e557ee562a28f238 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 14:42:48 +0200 Subject: [PATCH 101/321] restore: Fix capability migration requirements between different kernels. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When restoring on a kernel that has different number of supported capabilities than checkpoint one, check that the extra caps are unset. There are two directions to consider: 1) dump.cap_last_cap > restore.cap_last_cap - restoring might reduce the processes' capabilities if restored kernel doesn't support checkpointed caps. Warn. 2) dump.cap_last_cap < restore.cap_last_cap - restoring will fill the extra caps with zeroes. No changes. Note: `last_cap` might change without affecting `n_words`. 
Signed-off-by: Michał Mirosław --- criu/cr-restore.c | 40 ++++++++++++++++++++++++---------------- criu/kerndat.c | 9 ++++++++- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index bff41dc565..9107a23226 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2940,12 +2940,6 @@ static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry return ret; } -static inline int verify_cap_size(CredsEntry *ce) -{ - return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) && (ce->n_cap_prm == CR_CAP_SIZE) && - (ce->n_cap_bnd == CR_CAP_SIZE)); -} - static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; @@ -3360,17 +3354,31 @@ static bool groups_match(gid_t *groups, int n_groups) return ret; } +static void copy_caps(u32 *out_caps, u32 *in_caps, int n_words) +{ + int i, cap_end; + + for (i = kdat.last_cap + 1; i < 32 * n_words; ++i) { + if (~in_caps[i / 32] & (1 << (i % 32))) + continue; + + pr_warn("Dropping unsupported capability %d > %d)\n", i, kdat.last_cap); + /* extra caps will be cleared below */ + } + + n_words = min(n_words, (kdat.last_cap + 31) / 32); + cap_end = (kdat.last_cap & 31) + 1; + memcpy(out_caps, in_caps, sizeof(*out_caps) * n_words); + if ((cap_end & 31) && n_words) + out_caps[n_words - 1] &= (1 << cap_end) - 1; + memset(out_caps + n_words, 0, sizeof(*out_caps) * (CR_CAP_SIZE - n_words)); +} + static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) { unsigned long this_pos; struct thread_creds_args *args; - if (!verify_cap_size(ce)) { - pr_err("Caps size mismatch %d %d %d %d\n", (int)ce->n_cap_inh, (int)ce->n_cap_eff, (int)ce->n_cap_prm, - (int)ce->n_cap_bnd); - return ERR_PTR(-EINVAL); - } - this_pos = rst_mem_align_cpos(RM_PRIVATE); args = rst_mem_alloc(sizeof(*args), RM_PRIVATE); @@ -3458,10 +3466,10 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo
args->creds.groups = NULL; args->creds.lsm_profile = NULL; - memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh)); - memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff)); - memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm)); - memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd)); + copy_caps(args->cap_inh, ce->cap_inh, ce->n_cap_inh); + copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); + copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); + copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; diff --git a/criu/kerndat.c b/criu/kerndat.c index 4b836b5f76..bd1ccdc7d1 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -465,8 +465,15 @@ static int get_last_cap(void) struct sysctl_req req[] = { { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, }; + int ret; + + ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + if (ret || kdat.last_cap < 32 * CR_CAP_SIZE) + return ret; - return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + pr_err("Kernel reports more capabilities than this CRIU supports: %u > %u\n", + kdat.last_cap, 32 * CR_CAP_SIZE - 1); + return -1; } static bool kerndat_has_memfd_create(void) From 6bad5d2cdbf2f88aad0416da47649508276c8a84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 18:20:24 +0200 Subject: [PATCH 102/321] prctl: Migrate prctl(NO_NEW_PRIVS) setting. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/include/parasite.h | 1 + criu/include/prctl.h | 6 ++++++ criu/parasite-syscall.c | 4 ++++ criu/pie/parasite.c | 1 + criu/pie/restorer.c | 8 ++++++++ images/creds.proto | 1 + 6 files changed, 21 insertions(+) diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 787c927be9..739fbf2c37 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -148,6 +148,7 @@ struct parasite_dump_creds { int uids[4]; int gids[4]; + int no_new_privs; unsigned int secbits; unsigned int ngroups; /* diff --git a/criu/include/prctl.h b/criu/include/prctl.h index c843f40a75..4c2a548b16 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -30,6 +30,12 @@ #ifndef PR_SET_DUMPABLE #define PR_SET_DUMPABLE 4 #endif +#ifndef PR_GET_NO_NEW_PRIVS +#define PR_GET_NO_NEW_PRIVS 39 +#endif +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#endif #ifndef PR_SET_MM #define PR_SET_MM 35 diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 35489634d9..c08ed09b18 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -115,6 +115,10 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); + if (c->no_new_privs > 0) { + ce->no_new_privs = c->no_new_privs; + ce->has_no_new_privs = true; + } ce->secbits = c->secbits; ce->n_groups = c->ngroups; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 2303f41c39..58ea35892e 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -268,6 +268,7 @@ static int dump_creds(struct parasite_dump_creds *args) } } + args->no_new_privs = sys_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); ret = sys_getgroups(0, NULL); diff --git
a/criu/pie/restorer.c b/criu/pie/restorer.c index a0f3eb90b4..c3662b30b6 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -345,6 +345,14 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ if (lsm_set_label(args->lsm_sockcreate, "sockcreate", procfd) < 0) return -1; + if (ce->has_no_new_privs && ce->no_new_privs) { + ret = sys_prctl(PR_SET_NO_NEW_PRIVS, ce->no_new_privs, 0, 0, 0); + if (ret) { + pr_err("Unable to set no_new_privs=%d: %d\n", ce->no_new_privs, ret); + return -1; + } + } + return 0; } diff --git a/images/creds.proto b/images/creds.proto index 6228f7fcbb..220ed38587 100644 --- a/images/creds.proto +++ b/images/creds.proto @@ -24,4 +24,5 @@ message creds_entry { optional string lsm_profile = 15; optional string lsm_sockcreate = 16; optional bytes apparmor_data = 17; + optional uint32 no_new_privs = 18; } From d4902182b0bf9b69c11e4159110bba6a65a846ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Wed, 19 Jul 2023 18:57:09 +0200 Subject: [PATCH 103/321] prctl: test prctl(NO_NEW_PRIVS) setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- test/zdtm/static/Makefile | 1 + test/zdtm/static/seccomp_no_new_privs.c | 42 +++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 test/zdtm/static/seccomp_no_new_privs.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 4b3d2e3418..30429e425a 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -215,6 +215,7 @@ TST_NOFILE := \ seccomp_filter_tsync \ seccomp_filter_threads \ seccomp_filter_inheritance \ + seccomp_no_new_privs \ different_creds \ vsx \ bridge \ diff --git a/test/zdtm/static/seccomp_no_new_privs.c b/test/zdtm/static/seccomp_no_new_privs.c new file mode 100644 index 0000000000..95f9501ed4 --- /dev/null +++ b/test/zdtm/static/seccomp_no_new_privs.c @@ -0,0 +1,42 @@ +#include 
+#include +
+#include "zdtmtst.h" + +const char *test_doc = "Check that NO_NEW_PRIVS attribute is restored"; +const char *test_author = "Michał Mirosław "; + +int main(int argc, char **argv) +{ + int ret; + + test_init(argc, argv); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 0) + fail("initial NO_NEW_PRIVS = %d != 0", ret); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (ret) { + pr_perror("Can't set NO_NEW_PRIVS attribute"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 1) + fail("restored NO_NEW_PRIVS = %d != 1", ret); + + pass(); + return 0; +} From 988a5f4816d5bf651f9cdfa8b85359da18a77a7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Tue, 25 Jul 2023 17:54:26 +0200 Subject: [PATCH 104/321] restore: Skip dropping BSET capability if irrelevant. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit prctl(NO_NEW_PRIVS) when set prevents child processes gaining capabilities not in permitted set. In this case, inability to clear capability from BSET that is not in the permitted set is harmless. 
Signed-off-by: Michał Mirosław --- criu/pie/restorer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index c3662b30b6..d4f77bfdee 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -301,10 +301,18 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ /* already set */ continue; ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0); - if (ret) { + if (!ret) + continue; + if (!ce->has_no_new_privs || !ce->no_new_privs || args->cap_prm[b] & (1 << i)) { pr_err("Unable to drop capability %d: %d\n", i + b * 32, ret); return -1; } + /* + * If prctl(NO_NEW_PRIVS) is going to be set then it + * will prevent inheriting the capabilities not in + * the permitted set. + */ + pr_warn("Unable to drop capability %d from bset: %d (but NO_NEW_PRIVS will drop it)\n", i + b * 32, ret); } } From cc500d9967441d0eb6b63b365eb3add9f0dc0a4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 22 May 2023 17:41:56 +0200 Subject: [PATCH 105/321] sk-inet: Extend 'TCP repair off' failure log. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include the file descriptor and error code in the debug message to make it more useful. Fixes: e7ba90955ce7 (2016-03-14 "cr-check: Inspect errno on syscall failures") Signed-off-by: Michał Mirosław --- criu/include/sk-inet.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 961d711ee7..b3a70fb27e 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -69,6 +69,7 @@ extern int inet_connect(int sk, struct inet_sk_info *); #ifdef CR_NOGLIBC #define setsockopt sys_setsockopt +#define pr_perror(fmt, ...) 
pr_err(fmt ": errno %d\n", ##__VA_ARGS__, -ret) #endif static inline void tcp_repair_off(int fd) { @@ -76,7 +77,7 @@ static inline void tcp_repair_off(int fd) ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) - pr_err("Failed to turn off repair mode on socket\n"); + pr_perror("Failed to turn off repair mode on socket %d", fd); } extern void tcp_locked_conn_add(struct inet_sk_info *); From f2d9672e5ece7607ae7d1cef2ec9edc244dccab1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 27 Jul 2023 21:21:41 +0200 Subject: [PATCH 106/321] memfd: dump and restore permissions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memfd is created by default with +x permissions set. This can be changed by a process using fchmod() and expected to prevent using this fd for exec(). Migrate the permissions. Signed-off-by: Michał Mirosław --- criu/memfd.c | 11 +++++++++-- images/memfd.proto | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/criu/memfd.c b/criu/memfd.c index 1b4278a7d3..2158b67206 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -91,6 +91,8 @@ static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char * mie.has_hugetlb_flag = true; mie.hugetlb_flag = flag | MFD_HUGETLB; } + mie.mode = st->st_mode; + mie.has_mode = true; mie.seals = fcntl(fd, F_GET_SEALS); if (mie.seals == -1) { @@ -279,8 +281,13 @@ static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) goto out; - if (cr_fchown(fd, mie->uid, mie->gid)) { - pr_perror("Can't change uid %d gid %d of memfd:%s", (int)mie->uid, (int)mie->gid, mie->name); + if (mie->has_mode) + ret = cr_fchperm(fd, mie->uid, mie->gid, mie->mode); + else + ret = cr_fchown(fd, mie->uid, mie->gid); + if (ret) { + pr_perror("Can't set permissions { uid %d gid %d mode %#o } of memfd:%s", (int)mie->uid, + (int)mie->gid, mie->has_mode ? 
(int)mie->mode : -1, mie->name); goto out; } diff --git a/images/memfd.proto b/images/memfd.proto index 0e625416a7..bb0be4a6fc 100644 --- a/images/memfd.proto +++ b/images/memfd.proto @@ -22,4 +22,5 @@ message memfd_inode_entry { required uint32 seals = 6 [(criu).flags = "seals.flags"]; required uint64 inode_id = 7; optional uint32 hugetlb_flag = 8; + optional uint32 mode = 9; }; From 88249fe5269bbd2df58d8109b5a485ce10d9f81f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 27 Jul 2023 21:30:26 +0200 Subject: [PATCH 107/321] zdtm/memfd00: test memfd file mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- test/zdtm/static/memfd00.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/zdtm/static/memfd00.c b/test/zdtm/static/memfd00.c index d037f69697..8d77ed06eb 100644 --- a/test/zdtm/static/memfd00.c +++ b/test/zdtm/static/memfd00.c @@ -30,8 +30,10 @@ int main(int argc, char *argv[]) { int fd, fl_flags1, fl_flags2, fd_flags1, fd_flags2; struct statfs statfs1, statfs2; + struct stat stat; off_t pos1, pos2; char buf[5]; + int fmode1, fmode2; test_init(argc, argv); @@ -58,6 +60,13 @@ int main(int argc, char *argv[]) if (lseek(fd, pos1, SEEK_SET) < 0) err(1, "seek error"); + if (fchmod(fd, 0642)) + err(1, "Can't set permission bits"); + + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode1 = stat.st_mode; + test_daemon(); test_waitsig(); @@ -85,6 +94,15 @@ int main(int argc, char *argv[]) return 1; } + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode2 = stat.st_mode; + + if (fmode1 != fmode2) { + fail("stat.st_mode = %#o != %#o", fmode2, fmode1); + return 1; + } + pos2 = lseek(fd, 0, SEEK_CUR); if (pos1 != pos2) { fail("position differs"); From 91186014495f306391405ac4cd4f6805922b9c14 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 23:00:07 -0700 Subject: [PATCH 108/321] apparmor: fix incorrect 
usage of sizeof on char ptr In criu/apparmor.c: write_aa_policy(), the arg path is passed as a char pointer. The original code used sizeof(path) to get the size of it, which is incorrect as it always returns the size of the char pointer (typically 8 or 4), not the actual capacity of the char array. Given that this function is only invoked with path declared as `char path[PATH_MAX]`, replacing sizeof(path) with PATH_MAX should correctly represent the maximum size of it. Fixes: 8723e3f ("check: add a feature test for apparmor_stacking") Signed-off-by: Haorong Lu --- criu/apparmor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/apparmor.c b/criu/apparmor.c index 9de54ce40b..5b62759e23 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -551,8 +551,8 @@ static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrit goto fail; } - ret = snprintf(path + offset + my_offset, sizeof(path) - offset - my_offset, "/.replace"); - if (ret < 0 || ret >= sizeof(path) - offset - my_offset) { + ret = snprintf(path + offset + my_offset, PATH_MAX - offset - my_offset, "/.replace"); + if (ret < 0 || ret >= PATH_MAX - offset - my_offset) { pr_err("snprintf failed\n"); goto fail; } From 1db922f1704e9ecfd5d8fcc898bd45fb150b192a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Mon, 31 Jul 2023 20:49:05 +0200 Subject: [PATCH 109/321] page-xfer: Pull tcp_cork,nodelay(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move tcp_cork() and tcp_nodelay() to the only user: page-xfer.c. While at it, fix error messages (as they do not refer to restoring the sockopt values) and demote them as they are not fatal to the page transfer. 
Signed-off-by: Michał Mirosław --- criu/include/util.h | 2 -- criu/page-xfer.c | 15 +++++++++++++++ criu/util.c | 15 --------------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/criu/include/util.h b/criu/include/util.h index 7e4a13a6a8..4334e69c2d 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -278,8 +278,6 @@ static inline int sk_wait_data(int sk) } void fd_set_nonblocking(int fd, bool on); -void tcp_nodelay(int sk, bool on); -void tcp_cork(int sk, bool on); const char *ns_to_string(unsigned int ns); diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 782d4cafce..94f4774148 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -157,6 +158,20 @@ static inline int send_psi(int sk, struct page_server_iov *pi) return send_psi_flags(sk, pi, 0); } +static void tcp_cork(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_CORK=%d", val); +} + +static void tcp_nodelay(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_NODELAY=%d", val); +} + /* page-server xfer */ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long len) { diff --git a/criu/util.c b/criu/util.c index bca7ad88a9..993ab97bb8 100644 --- a/criu/util.c +++ b/criu/util.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -1155,20 +1154,6 @@ const char *ns_to_string(unsigned int ns) } } -void tcp_cork(int sk, bool on) -{ - int val = on ? 1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) - pr_pwarn("Unable to restore TCP_CORK (%d)", val); -} - -void tcp_nodelay(int sk, bool on) -{ - int val = on ? 
1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) - pr_pwarn("Unable to restore TCP_NODELAY (%d)", val); -} - static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, unsigned short port) { memset(addr, 0, sizeof(*addr)); From 72494ed608b84c31ba01d6c2a703148d27701c75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Tue, 1 Aug 2023 22:39:59 +0200 Subject: [PATCH 110/321] irmap: scan user-provided paths in order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the scan use the order of paths that came from the user. Fixes: 4f2e4ab3be01 ("irmap: add --irmap-scan-path option"; 2015-09-16) Signed-off-by: Michał Mirosław --- criu/irmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/irmap.c b/criu/irmap.c index 2cdc660714..e12df5cb58 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -501,6 +501,6 @@ int irmap_scan_path_add(char *path) o->ir->path = path; o->ir->nr_kids = -1; - list_add(&o->node, &opts.irmap_scan_paths); + list_add_tail(&o->node, &opts.irmap_scan_paths); return 0; } From 242de4e72e9c92203149efbecf06aa88d959a6ff Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 7 Aug 2023 19:28:07 +0100 Subject: [PATCH 111/321] amdgpu_plugin: remove duplicated log prefix The log prefix "amdgpu_plugin:" is defined with `LOG_PREFIX` in `amdgpu_plugin.c`. However, the prefix is also included in each log message. 
As a result it appears duplicated in the log messages: (00.044324) amdgpu_plugin: amdgpu_plugin: devices:1 bos:58 objects:148 priv_data:45696 (00.045376) amdgpu_plugin: amdgpu_plugin: Thread[0x5589] started (00.167172) amdgpu_plugin: amdgpu_plugin: img_path = amdgpu-kfd-62.img (00.083739) amdgpu_plugin: amdgpu_plugin : amdgpu_plugin_dump_file() called for fd = 235 Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 96 +++++++++++++++++----------------- 1 file changed, 47 insertions(+), 49 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 0a55e34a2b..6397ecdb74 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -451,7 +451,7 @@ void getenv_bool(const char *var, bool *value) int amdgpu_plugin_init(int stage) { - pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); topology_init(&src_topology); topology_init(&dest_topology); @@ -481,7 +481,7 @@ int amdgpu_plugin_init(int stage) void amdgpu_plugin_fini(int stage, int ret) { - pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); if (stage == CR_PLUGIN_STAGE__RESTORE) sys_close_drm_render_devices(&dest_topology); @@ -513,7 +513,7 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) char img_path[128]; int ret = 0; - pr_debug("amdgpu_plugin: Enter %s\n", __func__); + pr_debug("Enter %s\n", __func__); ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { pr_perror("stat error for /dev/kfd"); @@ -539,7 +539,7 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) return 0; } - pr_perror("amdgpu_plugin: Can't handle the VMA mapping"); + pr_perror("Can't handle the VMA mapping"); return -ENOTSUP; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -857,7 +857,7 @@ void 
*dump_bo_contents(void *_thread_data) void *buffer; char img_path[40]; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); if (ret) { @@ -922,7 +922,7 @@ void *dump_bo_contents(void *_thread_data) } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -951,7 +951,7 @@ void *restore_bo_contents(void *_thread_data) int num_bos = 0; int i, ret = 0; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); if (ret) { @@ -989,8 +989,7 @@ void *restore_bo_contents(void *_thread_data) } if (total_bo_size != image_size) { - pr_err("amdgpu_plugin: %s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, - total_bo_size); + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, total_bo_size); ret = -EINVAL; goto exit; @@ -1026,7 +1025,7 @@ void *restore_bo_contents(void *_thread_data) } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -1054,9 +1053,9 @@ int check_hsakmt_shared_mem(uint64_t *shared_mem_size, uint32_t *shared_mem_magi /* First 4 bytes of shared file is the magic */ ret = read_file(HSAKMT_SHM_PATH, shared_mem_magic, sizeof(*shared_mem_magic)); if (ret) - pr_perror("amdgpu_plugin: Failed to read shared mem magic"); + pr_perror("Failed to read shared mem magic"); else - plugin_log_msg("amdgpu_plugin: Shared mem magic:0x%x\n", *shared_mem_magic); + 
plugin_log_msg("Shared mem magic:0x%x\n", *shared_mem_magic); return 0; } @@ -1071,7 +1070,7 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha return 0; if (!stat(HSAKMT_SHM_PATH, &st)) { - pr_debug("amdgpu_plugin: %s already exists\n", HSAKMT_SHM_PATH); + pr_debug("%s already exists\n", HSAKMT_SHM_PATH); } else { pr_info("Warning:%s was missing. Re-creating new file but we may lose perf counters\n", HSAKMT_SHM_PATH); @@ -1079,14 +1078,14 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha ret = ftruncate(fd, shared_mem_size); if (ret < 0) { - pr_err("amdgpu_plugin: Failed to truncate shared mem %s\n", HSAKMT_SHM); + pr_err("Failed to truncate shared mem %s\n", HSAKMT_SHM); close(fd); return -errno; } ret = write(fd, &shared_mem_magic, sizeof(shared_mem_magic)); if (ret != sizeof(shared_mem_magic)) { - pr_perror("amdgpu_plugin: Failed to restore shared mem magic"); + pr_perror("Failed to restore shared mem magic"); close(fd); return -errno; } @@ -1112,7 +1111,7 @@ static int unpause_process(int fd) ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); if (ret) { - pr_perror("amdgpu_plugin: Failed to unpause process"); + pr_perror("Failed to unpause process"); goto exit; } @@ -1254,7 +1253,7 @@ bool kernel_supports_criu(int fd) } if (kmtIoctl(fd, AMDKFD_IOC_GET_VERSION, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call get version ioctl"); + pr_perror("Failed to call get version ioctl"); ret = false; goto exit; } @@ -1262,8 +1261,8 @@ bool kernel_supports_criu(int fd) pr_debug("Kernel IOCTL version:%d.%02d\n", args.major_version, args.minor_version); if (args.major_version != KFD_IOCTL_MAJOR_VERSION || args.minor_version < MIN_KFD_IOCTL_MINOR_VERSION) { - pr_err("amdgpu_plugin: CR not supported on current kernel (current:%02d.%02d min:%02d.%02d)\n", - args.major_version, args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); + pr_err("CR not supported on current kernel 
(current:%02d.%02d min:%02d.%02d)\n", args.major_version, + args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); ret = false; goto exit; } @@ -1286,13 +1285,13 @@ int amdgpu_plugin_dump_file(int fd, int id) size_t len; if (fstat(fd, &st) == -1) { - pr_perror("amdgpu_plugin: fstat error"); + pr_perror("fstat error"); return -1; } ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { - pr_perror("amdgpu_plugin: fstat error for /dev/kfd"); + pr_perror("fstat error for /dev/kfd"); return -1; } @@ -1317,12 +1316,11 @@ int amdgpu_plugin_dump_file(int fd, int id) CriuRenderNode rd = CRIU_RENDER_NODE__INIT; struct tp_node *tp_node; - pr_info("amdgpu_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), - fd, id); + pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id); tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev)); if (!tp_node) { - pr_err("amdgpu_plugin: Failed to find a device with minor number = %d\n", minor(st.st_rdev)); + pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev)); return -ENODEV; } @@ -1350,7 +1348,7 @@ int amdgpu_plugin_dump_file(int fd, int id) return ret; } - pr_info("amdgpu_plugin: %s : %s() called for fd = %d\n", CR_PLUGIN_DESC.name, __func__, major(st.st_rdev)); + pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev)); /* KFD only allows ioctl calls from the same process that opened the KFD file descriptor. 
* The existing /dev/kfd file descriptor that is passed in is only allowed to do IOCTL calls with @@ -1362,13 +1360,13 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_PROCESS_INFO; if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call process info ioctl"); + pr_perror("Failed to call process info ioctl"); ret = -1; goto exit; } - pr_info("amdgpu_plugin: devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, - args.num_objects, args.priv_data_size); + pr_info("devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, args.num_objects, + args.priv_data_size); e = xmalloc(sizeof(*e)); if (!e) { @@ -1401,7 +1399,7 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_CHECKPOINT; ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); if (ret) { - pr_perror("amdgpu_plugin: Failed to call dumper (process) ioctl"); + pr_perror("Failed to call dumper (process) ioctl"); goto exit; } @@ -1423,11 +1421,11 @@ int amdgpu_plugin_dump_file(int fd, int id) goto exit; snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); - pr_info("amdgpu_plugin: img_path = %s\n", img_path); + pr_info("img_path = %s\n", img_path); len = criu_kfd__get_packed_size(e); - pr_info("amdgpu_plugin: Len = %ld\n", len); + pr_info("Len = %ld\n", len); buf = xmalloc(len); if (!buf) { @@ -1453,9 +1451,9 @@ int amdgpu_plugin_dump_file(int fd, int id) free_e(e); if (ret) - pr_err("amdgpu_plugin: Failed to dump (ret:%d)\n", ret); + pr_err("Failed to dump (ret:%d)\n", ret); else - pr_info("amdgpu_plugin: Dump successful\n"); + pr_info("Dump successful\n"); return ret; } @@ -1501,10 +1499,10 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) device_bucket->drm_fd = node_get_drm_render_device(tp_node); if (device_bucket->drm_fd < 0) { - pr_perror("amdgpu_plugin: Can't pass NULL drm render fd to driver"); + pr_perror("Can't pass NULL drm render fd to driver"); goto exit; } else 
{ - pr_info("amdgpu_plugin: passing drm render fd = %d to driver\n", device_bucket->drm_fd); + pr_info("passing drm render fd = %d to driver\n", device_bucket->drm_fd); } } @@ -1588,7 +1586,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf vma_md->new_pgoff = bo_bucket->restored_offset; vma_md->fd = node_get_drm_render_device(tp_node); - plugin_log_msg("amdgpu_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx " + plugin_log_msg("adding vma_entry:addr:0x%lx old-off:0x%lx " "new_off:0x%lx new_minor:%d\n", vma_md->vma_entry, vma_md->old_pgoff, vma_md->new_pgoff, vma_md->new_minor); @@ -1669,7 +1667,7 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; - pr_info("amdgpu_plugin: Initialized kfd plugin restorer with ID = %d\n", id); + pr_info("Initialized kfd plugin restorer with ID = %d\n", id); snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); @@ -1713,7 +1711,7 @@ int amdgpu_plugin_restore_file(int id) } fclose(img_fp); - pr_info("amdgpu_plugin: render node gpu_id = 0x%04x\n", rd->gpu_id); + pr_info("render node gpu_id = 0x%04x\n", rd->gpu_id); target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id); if (!target_gpu_id) { @@ -1727,11 +1725,11 @@ int amdgpu_plugin_restore_file(int id) goto fail; } - pr_info("amdgpu_plugin: render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); + pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); fd = node_get_drm_render_device(tp_node); if (fd < 0) - pr_err("amdgpu_plugin: Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); + pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); fail: criu_render_node__free_unpacked(rd, NULL); xfree(buf); @@ -1752,7 +1750,7 @@ int amdgpu_plugin_restore_file(int id) return -1; } - pr_info("amdgpu_plugin: Opened kfd, fd = %d\n", fd); + pr_info("Opened kfd, fd = %d\n", fd); if (!kernel_supports_criu(fd)) return -ENOTSUP; @@ -1780,7 +1778,7 @@ int 
amdgpu_plugin_restore_file(int id) return -1; } - plugin_log_msg("amdgpu_plugin: read image file data\n"); + plugin_log_msg("read image file data\n"); /* * Initialize fd_next to be 1 greater than the biggest file descriptor in use by the target restore process. @@ -1847,10 +1845,10 @@ int amdgpu_plugin_restore_file(int id) xfree(buf); if (ret) { - pr_err("amdgpu_plugin: Failed to restore (ret:%d)\n", ret); + pr_err("Failed to restore (ret:%d)\n", ret); fd = ret; } else { - pr_info("amdgpu_plugin: Restore successful (fd:%d)\n", fd); + pr_info("Restore successful (fd:%d)\n", fd); } return fd; @@ -1870,7 +1868,7 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const char *p_end; bool is_kfd = false, is_renderD = false; - plugin_log_msg("amdgpu_plugin: Enter %s\n", __func__); + plugin_log_msg("Enter %s\n", __func__); strncpy(path, in_path, sizeof(path)); @@ -1908,8 +1906,8 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const else *updated_fd = -1; - plugin_log_msg("amdgpu_plugin: old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, - vma_md->new_pgoff, *updated_fd); + plugin_log_msg("old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, vma_md->new_pgoff, + *updated_fd); return 1; } @@ -1924,7 +1922,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) struct kfd_ioctl_criu_args args = { 0 }; int fd, ret = 0; - pr_info("amdgpu_plugin: Inside %s for target pid = %d\n", __func__, target_pid); + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); if (fd < 0) { @@ -1934,7 +1932,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) args.pid = target_pid; args.op = KFD_CRIU_OP_RESUME; - pr_info("amdgpu_plugin: Calling IOCTL to start notifiers and queues\n"); + pr_info("Calling IOCTL to start notifiers and queues\n"); if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { pr_perror("restore late ioctl failed"); ret = -1; From 
6fc5bc668f1556a401bfbff88df17c08ad27407f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 7 Aug 2023 15:45:37 -0700 Subject: [PATCH 112/321] scripts/apt: don't hide apt output It is required to investigate issues. Signed-off-by: Andrei Vagin --- scripts/ci/apt-install | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/apt-install b/scripts/ci/apt-install index 45aca13f40..676e0f7949 100755 --- a/scripts/ci/apt-install +++ b/scripts/ci/apt-install @@ -15,7 +15,7 @@ while true; do if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then exit 1 fi - apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends "$@" && break + apt-get update -y && apt-get install -y --no-install-recommends "$@" && break # In case it is a network error let's wait a bit. echo "Retrying attempt ${install_retry_counter}" From 21992206b2e049f0e41efcfa46f9cdbba0622ca8 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 7 Aug 2023 16:00:39 -0700 Subject: [PATCH 113/321] ci/docker: install all required packages This change fixes the issue: ``` The following packages have unmet dependencies: docker-ce : Depends: containerd.io (>= 1.6.4) E: Unable to correct problems, you have held broken packages. ``` Signed-off-by: Andrei Vagin --- scripts/ci/docker-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index bd46d5dd31..22d326a371 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -15,7 +15,7 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce +./apt-install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin # shellcheck source=/dev/null . 
/etc/lsb-release From e1cda9f074b853c7a2d6cd2c112ce1950bb1e451 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 13 Aug 2023 12:21:35 +0100 Subject: [PATCH 114/321] lib/py: add VMA_AREA_MEMFD constant The VMA_AREA_MEMFD constant was introduced with commit 29a1a88bcebaf9d83591077d2bec424da82c0e71 memfd: add memory mapping support This patch extends the status map used in CRIT and coredump with the value of this constant to recognize it. Signed-off-by: Radostin Stoyanov --- coredump/criu_coredump/coredump.py | 1 + lib/py/images/pb2dict.py | 1 + 2 files changed, 2 insertions(+) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 0b8a02e0aa..20ec8e5dc8 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -53,6 +53,7 @@ "VMA_AREA_SOCKET": 1 << 11, "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, + "VMA_AREA_MEMFD": 1 << 14, "VMA_AREA_UNSUPP": 1 << 31 } diff --git a/lib/py/images/pb2dict.py b/lib/py/images/pb2dict.py index c7046429e0..fe41642d55 100644 --- a/lib/py/images/pb2dict.py +++ b/lib/py/images/pb2dict.py @@ -102,6 +102,7 @@ def _custom_conv(field): ('VMA_AREA_SOCKET', 1 << 11), ('VMA_AREA_VVAR', 1 << 12), ('VMA_AREA_AIORING', 1 << 13), + ('VMA_AREA_MEMFD', 1 << 14), ('VMA_UNSUPP', 1 << 31), ] From 288d6a61e29de9be17bc8c47169e4a907d01d60c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 21 Aug 2023 08:10:37 -0700 Subject: [PATCH 115/321] loongarch64: reformat syscall_64.tbl for 8-wide tabs Signed-off-by: Andrei Vagin --- .../plugins/std/syscalls/syscall_64.tbl | 228 +++++++++--------- 1 file changed, 114 insertions(+), 114 deletions(-) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl index b37a22674e..a0ad0cef4b 100644 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -5,117 +5,117 @@ # # 
__NR_name code name arguments # ------------------------------------------------------------------------------------------------------------------------------------------------------------- -__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) -__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) -__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) -__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) -__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) -__NR_flock 32 sys_flock (int fd, unsigned long cmd) -__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) -__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) -__NR_umount2 39 sys_umount2 (char *name, int flags) -__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) -__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) -__NR_close 57 sys_close (int fd) -__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) -__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) -__NR_read 63 sys_read (int fd, void *buf, unsigned long count) -__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) -__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) -__NR_preadv 69 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) -__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) -__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) -__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) 
-__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) -__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) -__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) -__NR_personality 92 sys_personality (unsigned int personality) -__NR_exit 93 sys_exit (unsigned long error_code) -__NR_exit_group 94 sys_exit_group (int error_code) -__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) -__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) -__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) -__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) -__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) -__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) -__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) -__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) -__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) -__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) -__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) -__NR_sys_timer_settime 110 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) -__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) -__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) -__NR_restart_syscall 128 sys_restart_syscall (void) -__NR_kill 129 sys_kill (long pid, int sig) -__NR_sigaltstack 
132 sys_sigaltstack (const void *uss, void *uoss) -__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) -__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) -__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) -__NR_rt_sigreturn 139 sys_rt_sigreturn (void) -__NR_setpriority 140 sys_setpriority (int which, int who, int nice) -__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) -__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) -__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) -__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) -__NR_getpgid 155 sys_getpgid (pid_t pid) -__NR_setfsuid 151 sys_setfsuid (int fsuid) -__NR_setfsgid 152 sys_setfsgid (int fsgid) -__NR_getsid 156 sys_getsid (void) -__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) -__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) -__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) -__NR_umask 166 sys_umask (int mask) -__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) -__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) -__NR_getpid 172 sys_getpid (void) -__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) -__NR_gettid 178 sys_gettid (void) -__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) -__NR_socket 198 sys_socket (int domain, int type, int protocol) -__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) -__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) -__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) -__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, 
struct sockaddr *addr, int *addr_len) -__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) -__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) -__NR_shutdown 210 sys_shutdown (int sockfd, int how) -__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) -__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) -__NR_brk 214 sys_brk (void *addr) -__NR_munmap 215 sys_munmap (void *addr, unsigned long len) -__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) -__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) -__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) -__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) -__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) -__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) -__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) -__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) -__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) -__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) -__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) -__NR_setns 268 sys_setns (int fd, int nstype) -__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) -__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) -__NR_memfd_create 279 sys_memfd_create (const 
char *name, unsigned int flags) -__NR_userfaultfd 282 sys_userfaultfd (int flags) -__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) -__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) -__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) -__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) -__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) -__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) -__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) -__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) -#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) -#__NR_rmdir ! sys_rmdir (const char *name) -#__NR_unlink ! sys_unlink (char *pathname) -#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) -#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) -#__NR_mkdir ! sys_mkdir (const char *name, int mode) -#__NR_open ! 
sys_open (const char *filename, unsigned long flags, unsigned long mode) +__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_fcntl 2 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 2 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 3 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 3 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 3 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 3 sys_umount2 (char *name, int flags) +__NR_mount 4 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 4 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 5 sys_close (int fd) +__NR_openat 5 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 6 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 6 sys_read (int fd, void *buf, unsigned long count) +__NR_write 6 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 6 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 6 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 7 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 7 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 7 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 7 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 8 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 9 sys_capget 
(struct cap_header *h, struct cap_data *d) +__NR_capset 9 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 9 sys_personality (unsigned int personality) +__NR_exit 9 sys_exit (unsigned long error_code) +__NR_exit_group 9 sys_exit_group (int error_code) +__NR_waitid 9 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 9 sys_set_tid_address (int *tid_addr) +__NR_futex 9 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 9 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 1 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 1 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 1 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 1 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 1 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 1 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 1 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 1 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 1 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 1 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_sched_setscheduler 1 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 1 sys_restart_syscall (void) +__NR_kill 1 sys_kill (long pid, int sig) +__NR_sigaltstack 1 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 1 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) 
+__NR_rt_sigprocmask 1 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 1 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 1 sys_rt_sigreturn (void) +__NR_setpriority 1 sys_setpriority (int which, int who, int nice) +__NR_setresuid 1 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 1 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 1 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 1 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 1 sys_getpgid (pid_t pid) +__NR_setfsuid 1 sys_setfsuid (int fsuid) +__NR_setfsgid 1 sys_setfsgid (int fsgid) +__NR_getsid 1 sys_getsid (void) +__NR_getgroups 1 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 1 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 1 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 1 sys_umask (int mask) +__NR_prctl 1 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_gettimeofday 1 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 1 sys_getpid (void) +__NR_ptrace 1 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 1 sys_gettid (void) +__NR_shmat 1 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 1 sys_socket (int domain, int type, int protocol) +__NR_bind 2 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 2 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 2 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 2 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 2 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 2 sys_getsockopt (int sockfd, int level, int optname, const 
void *optval, socklen_t *optlen) +__NR_shutdown 2 sys_shutdown (int sockfd, int how) +__NR_sendmsg 2 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 2 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 2 sys_brk (void *addr) +__NR_munmap 2 sys_munmap (void *addr, unsigned long len) +__NR_mremap 2 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 2 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 2 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 2 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 2 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 2 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 2 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 2 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 2 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 2 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 2 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 2 sys_setns (int fd, int nstype) +__NR_kcmp 2 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 2 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 2 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 2 sys_userfaultfd (int flags) +__NR_rseq 2 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 4 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 4 
sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 4 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 4 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 4 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 4 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 4 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 4 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 4 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) +#__NR_rmdir ! sys_rmdir (const char *name) +#__NR_unlink ! sys_unlink (char *pathname) +#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) +#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) +#__NR_mkdir ! sys_mkdir (const char *name, int mode) +#__NR_open ! sys_open (const char *filename, unsigned long flags, unsigned long mode) From 2f50da4389a588ae675830788f9de09ed64fa578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 6 Oct 2022 17:52:46 +0200 Subject: [PATCH 116/321] dump+restore: Implement membarrier() registration c/r. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: Silently drops MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED as it's not currently detectable. This is still better than silently dropping all membarrier() registrations. 
Signed-off-by: Michał Mirosław --- .../arch/arm/plugins/std/syscalls/syscall.def | 1 + .../plugins/std/syscalls/syscall_64.tbl | 1 + .../mips/plugins/std/syscalls/syscall_64.tbl | 1 + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + .../plugins/std/syscalls/syscall-s390.tbl | 1 + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + criu/cr-dump.c | 5 ++ criu/cr-restore.c | 3 ++ criu/include/parasite.h | 1 + criu/include/restorer.h | 1 + criu/pie/parasite.c | 50 +++++++++++++++++++ criu/pie/restorer.c | 27 ++++++++++ images/core.proto | 2 + 14 files changed, 96 insertions(+) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 8bcc3cc50a..7489ee0c11 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -122,3 +122,4 @@ pidfd_open 434 434 (pid_t pid, unsigned int flags) openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl index a0ad0cef4b..f844d898dd 100644 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -102,6 +102,7 @@ __NR_kcmp 2 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, u __NR_seccomp 2 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_memfd_create 2 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 2 sys_userfaultfd (int flags) +__NR_membarrier 3 sys_membarrier (int cmd, unsigned int flags, int cpu_id) __NR_rseq 2 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) 
__NR_open_tree 4 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) __NR_move_mount 4 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index 505ec849d7..9f50d5e8ad 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -119,3 +119,4 @@ __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 5318 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index af40d71045..4c9b75cf1b 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -118,3 +118,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 365 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index 6a349e1cb7..af7d550e2c 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -118,3 +118,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) 
__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 356 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index a119a59b2e..ab36a5cd6f 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -106,3 +106,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 375 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 16dd86e791..57681b79a7 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -117,3 +117,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 340fb96ecd..ee5974acc9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -770,6 +770,11 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->tc->child_subreaper = misc->child_subreaper; 
core->tc->has_child_subreaper = true; + if (misc->membarrier_registration_mask) { + core->tc->membarrier_registration_mask = misc->membarrier_registration_mask; + core->tc->has_membarrier_registration_mask = true; + } + ret = get_task_personality(pid, &core->tc->personality); if (ret < 0) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9107a23226..2700497216 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -863,6 +863,9 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a if (tc->has_child_subreaper) args->child_subreaper = tc->child_subreaper; + if (tc->has_membarrier_registration_mask) + args->membarrier_registration_mask = tc->membarrier_registration_mask; + /* loginuid value is critical to restore */ if (kdat.luid == LUID_FULL && tc->has_loginuid && tc->loginuid != INVALID_UID) { ret = prepare_loginuid(tc->loginuid); diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 739fbf2c37..5209b6da22 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -128,6 +128,7 @@ struct parasite_dump_misc { int dumpable; int thp_disabled; int child_subreaper; + int membarrier_registration_mask; }; /* diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 2475ee0bcb..f398d8d8fe 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -229,6 +229,7 @@ struct task_restore_args { #endif int lsm_type; int child_subreaper; + int membarrier_registration_mask; bool has_clone3_set_tid; /* diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 58ea35892e..c0604903b9 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -211,6 +211,42 @@ static int dump_thread_common(struct parasite_dump_thread *ti) return ret; } +/* + * Returns a membarrier() registration command (it is a bitmask) if the process + * was registered for specified (as a bit index) membarrier()-issuing command; + * returns zero otherwise. 
+ */ +static int get_membarrier_registration_mask(int cmd_bit) +{ + unsigned cmd = 1 << cmd_bit; + int ret; + + /* + * Issuing a barrier will be successful only if the process was registered + * for this type of membarrier. All errors are a sign that the type issued + * was not registered (EPERM) or not supported by kernel (EINVAL or ENOSYS). + */ + ret = sys_membarrier(cmd, 0, 0); + if (ret && ret != -EPERM && ret != -EINVAL && ret != -ENOSYS) { + pr_err("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + return -1; + } + pr_debug("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + /* + * For supported registrations, MEMBARRIER_CMD_REGISTER_xxx = MEMBARRIER_CMD_xxx << 1. + * See: enum membarrier_cmd in include/uapi/linux/membarrier.h in kernel sources. + */ + return ret ? 0 : cmd << 1; +} + +/* + * It would be better to check the following with BUILD_BUG_ON, but we might + * have an old linux/membarrier.h header without necessary enum values. + */ +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED 3 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE 5 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ 7 + static int dump_misc(struct parasite_dump_misc *args) { int ret; @@ -225,6 +261,20 @@ static int dump_misc(struct parasite_dump_misc *args) args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); args->thp_disabled = sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + args->membarrier_registration_mask = 0; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); + if (ret < 0) + return -1; + args->membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); + if (ret < 0) + return -1; + args->membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); + if (ret < 0) + return -1; + args->membarrier_registration_mask |= ret; + ret = sys_prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&args->child_subreaper, 0, 0, 0); if (ret) 
pr_err("PR_GET_CHILD_SUBREAPER failed (%d)\n", ret); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index d4f77bfdee..bbee0f6fb4 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1537,6 +1537,30 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) return 0; } +/* + * Restore membarrier() registrations. + */ +static int restore_membarrier_registrations(int mask) +{ + unsigned long bitmap[1] = { mask }; + int i, err, ret = 0; + + if (!mask) + return 0; + + pr_info("Restoring membarrier() registrations %x\n", mask); + + for_each_bit(i, bitmap) { + err = sys_membarrier(1 << i, 0, 0); + if (!err) + continue; + pr_err("Can't restore membarrier(1 << %d) registration: %d\n", i, err); + ret = -1; + } + + return ret; +} + /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -2023,6 +2047,9 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } + if (restore_membarrier_registrations(args->membarrier_registration_mask) < 0) + goto core_restore_end; + pr_info("%ld: Restored\n", sys_getpid()); restore_finish_stage(task_entries_local, CR_STATE_RESTORE); diff --git a/images/core.proto b/images/core.proto index 1882fe8e42..5b07b5c448 100644 --- a/images/core.proto +++ b/images/core.proto @@ -64,6 +64,8 @@ message task_core_entry { optional uint64 blk_sigset_extended = 20[(criu).hex = true]; optional uint32 stop_signo = 21; + + optional uint32 membarrier_registration_mask = 22 [(criu).hex = true]; } message task_kobj_ids_entry { From b5c3cccf44130bb11ab830e9a5d3637e366fa29a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 19 Jun 2023 12:00:51 +0200 Subject: [PATCH 117/321] zdtm: membarrier: test migration of membarrier() registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- test/zdtm/static/Makefile | 1 + test/zdtm/static/membarrier.c | 116 
++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 test/zdtm/static/membarrier.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 30429e425a..cd53932db4 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -62,6 +62,7 @@ TST_NOFILE := \ pthread_timers \ pthread_timers_h \ rseq00 \ + membarrier \ vdso00 \ vdso01 \ vdso02 \ diff --git a/test/zdtm/static/membarrier.c b/test/zdtm/static/membarrier.c new file mode 100644 index 0000000000..a04b360351 --- /dev/null +++ b/test/zdtm/static/membarrier.c @@ -0,0 +1,116 @@ +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test membarrier() migration"; +const char *test_author = "MichaÅ‚ MirosÅ‚aw "; + +/* + * Define membarrier() CMDs to avoid depending on exact kernel header version. + * FIXME: use MEMBARRIER_CMD_GET_REGISTRATIONS if supported by kernel. + */ +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED (1 << 3) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (1 << 4) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE (1 << 5) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE (1 << 6) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ (1 << 7) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8) + +static int membarrier(int cmd, unsigned int flags, int cpu_id) +{ + return syscall(__NR_membarrier, cmd, flags, cpu_id); +} + +static const struct { + const char *name_suffix; + int register_cmd; + int execute_cmd; +} membarrier_cmds[] = { + { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, + { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, + { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, +}; +static const int n_membarrier_cmds = sizeof(membarrier_cmds) / sizeof(*membarrier_cmds); + +static int register_membarriers(void) +{ + int barriers_supported, 
barriers_registered; + bool all_ok = true; + + barriers_supported = membarrier(MEMBARRIER_CMD_QUERY, 0, 0); + if (barriers_supported < 0) { + fail("membarrier() not supported by running kernel"); + return -1; + } + + barriers_registered = 0; + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_supported & membarrier_cmds[i].register_cmd) + continue; + + barriers_registered |= membarrier_cmds[i].execute_cmd; + + if (membarrier(membarrier_cmds[i].register_cmd, 0, 0) < 0) { + pr_perror("membarrier(REGISTER_PRIVATE_EXPEDITED%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if (!all_ok) { + fail("can't register membarrier()s - tried %#x, kernel %#x", + barriers_registered, barriers_supported); + return -1; + } + + if (!barriers_registered) { + fail("no known membarrier() cmds are supported by the kernel"); + return -1; + } + + return barriers_registered; +} + +static bool check_membarriers(int barriers_registered) +{ + bool all_ok = true; + + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_registered & membarrier_cmds[i].execute_cmd) + continue; + if (membarrier(membarrier_cmds[i].execute_cmd, 0, 0) < 0) { + pr_perror("membarrier(PRIVATE_EXPEDITED%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if (!all_ok) + fail("membarrier() check failed"); + + return all_ok; +} + +int main(int argc, char **argv) +{ + int barriers_registered; + + test_init(argc, argv); + + barriers_registered = register_membarriers(); + if (barriers_registered < 0) + return 1; + + test_msg("Pre-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + test_daemon(); + test_waitsig(); + + test_msg("Post-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + pass(); + return 0; +} From 5fedcaaa5aa53d7ee8761b29e0c26cb20b7f7472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Wed, 9 Aug 2023 14:42:22 +0200 Subject: [PATCH 118/321] 
Put a cap on the size of single preadv in restore operation. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While each preadv() is followed by a fallocate() that removes the data range from image files on tmpfs, temporarily (between preadv() and fallocate()) the same data is in two places; this increases the memory overhead of restore operation by the size of a single preadv. Uncapped preadv() would read up to 2 GiB of data, thus we limit that to a smaller block size (128 MiB). Based-on-work-by: Paweł Stradomski Signed-off-by: Michał Mirosław --- criu/pie/restorer.c | 47 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index bbee0f6fb4..0de2423a15 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -56,6 +56,12 @@ */ #define MAX_GETGROUPS_CHECKED (512 / sizeof(unsigned int)) +/* + * Memory overhead limit for reading VMA when auto_dedup is enabled. + * An arbitrarily chosen trade-off point between speed and memory usage. + */ +#define AUTO_DEDUP_OVERHEAD_BYTES (128 << 20) + #ifndef PR_SET_PDEATHSIG #define PR_SET_PDEATHSIG 1 #endif @@ -1477,6 +1483,40 @@ static int fd_poll(int inotify_fd) return sys_ppoll(&pfd, 1, &tmo, NULL, sizeof(sigset_t)); } +/* + * Call preadv() but limit size of the read. Zero `max_to_read` skips the limit. 
+ */ +static ssize_t preadv_limited(int fd, struct iovec *iovs, int nr, off_t offs, size_t max_to_read) +{ + size_t saved_last_iov_len = 0; + ssize_t ret; + + if (max_to_read) { + for (int i = 0; i < nr; ++i) { + if (iovs[i].iov_len <= max_to_read) { + max_to_read -= iovs[i].iov_len; + continue; + } + + if (!max_to_read) { + nr = i; + break; + } + + saved_last_iov_len = iovs[i].iov_len; + iovs[i].iov_len = max_to_read; + nr = i + 1; + break; + } + } + + ret = sys_preadv(fd, iovs, nr, offs); + if (saved_last_iov_len) + iovs[nr - 1].iov_len = saved_last_iov_len; + + return ret; +} + /* * In the worst case buf size should be: * sizeof(struct inotify_event) * 2 + PATH_MAX @@ -1748,7 +1788,12 @@ long __export_restore_task(struct task_restore_args *args) while (nr) { pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); - r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off); + /* + * If we're requested to punch holes in the file after reading we do + * it to save memory. Limit the reads then to an arbitrary block size. + */ + r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, + args->auto_dedup ? AUTO_DEDUP_OVERHEAD_BYTES : 0); if (r < 0) { pr_err("Can't read pages data (%d)\n", (int)r); goto core_restore_end; From 649292c1b0b8a8d2531bbf81a9456d444bdb78d6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 19 Aug 2023 21:56:25 -0700 Subject: [PATCH 119/321] github: auto-remove `changes requested` and `awaiting reply` labels Labels are removed when new comments are posted. 
Signed-off-by: Andrei Vagin --- .github/workflows/manage-labels.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/manage-labels.yml diff --git a/.github/workflows/manage-labels.yml b/.github/workflows/manage-labels.yml new file mode 100644 index 0000000000..a2bcd88604 --- /dev/null +++ b/.github/workflows/manage-labels.yml @@ -0,0 +1,14 @@ +name: Remove labels +on: [issue_comment, pull_request_review_comment] +jobs: + remove-labels-on-comments: + name: Remove labels on comments + if: github.event_name == 'issue_comment' + runs-on: ubuntu-latest + steps: + - uses: mondeja/remove-labels-gh-action@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + labels: | + changes requested + awaiting reply From 942b5fdcc6f12b08fbd496f0f5a08b807f224b94 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 24 Aug 2023 13:52:21 -0700 Subject: [PATCH 120/321] loongarch64: fix syscall_64.tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 288d6a61e29d change broke all the syscall numbers. 
Reported-by: Michał Mirosław Fixes: (288d6a61e29d "loongarch64: reformat syscall_64.tbl for 8-wide tabs") Signed-off-by: Andrei Vagin --- .../plugins/std/syscalls/syscall_64.tbl | 210 +++++++++--------- 1 file changed, 105 insertions(+), 105 deletions(-) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl index f844d898dd..aa6ffb44d1 100644 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -8,111 +8,111 @@ __NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) __NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) -__NR_fcntl 2 sys_fcntl (int fd, int type, long arg) -__NR_ioctl 2 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) -__NR_flock 3 sys_flock (int fd, unsigned long cmd) -__NR_mkdirat 3 sys_mkdirat (int dfd, const char *pathname, int flag) -__NR_unlinkat 3 sys_unlinkat (int dfd, const char *pathname, int flag) -__NR_umount2 3 sys_umount2 (char *name, int flags) -__NR_mount 4 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) -__NR_fallocate 4 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) -__NR_close 5 sys_close (int fd) -__NR_openat 5 sys_openat (int dfd, const char *filename, int flags, int mode) -__NR_lseek 6 sys_lseek (int fd, unsigned long offset, unsigned long origin) -__NR_read 6 sys_read (int fd, void *buf, unsigned long count) -__NR_write 6 sys_write (int fd, const void *buf, unsigned long count) -__NR_pread64 6 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) -__NR_preadv 6 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) -__NR_ppoll 7 sys_ppoll (struct pollfd *fds, unsigned int nfds, 
const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -__NR_signalfd4 7 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) -__NR_vmsplice 7 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) -__NR_readlinkat 7 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) -__NR_timerfd_settime 8 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) -__NR_capget 9 sys_capget (struct cap_header *h, struct cap_data *d) -__NR_capset 9 sys_capset (struct cap_header *h, struct cap_data *d) -__NR_personality 9 sys_personality (unsigned int personality) -__NR_exit 9 sys_exit (unsigned long error_code) -__NR_exit_group 9 sys_exit_group (int error_code) -__NR_waitid 9 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) -__NR_set_tid_address 9 sys_set_tid_address (int *tid_addr) -__NR_futex 9 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) -__NR_set_robust_list 9 sys_set_robust_list (struct robust_list_head *head, size_t len) -__NR_get_robust_list 1 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) -__NR_nanosleep 1 sys_nanosleep (struct timespec *req, struct timespec *rem) -__NR_getitimer 1 sys_getitimer (int which, const struct itimerval *val) -__NR_setitimer 1 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) -__NR_sys_timer_create 1 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) -__NR_sys_timer_gettime 1 sys_timer_gettime (int timer_id, const struct itimerspec *setting) -__NR_sys_timer_getoverrun 1 sys_timer_getoverrun (int timer_id) -__NR_sys_timer_settime 1 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) -__NR_sys_timer_delete 1 sys_timer_delete 
(kernel_timer_t timer_id) -__NR_clock_gettime 1 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) -__NR_sched_setscheduler 1 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) -__NR_restart_syscall 1 sys_restart_syscall (void) -__NR_kill 1 sys_kill (long pid, int sig) -__NR_sigaltstack 1 sys_sigaltstack (const void *uss, void *uoss) -__NR_rt_sigaction 1 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) -__NR_rt_sigprocmask 1 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) -__NR_rt_sigqueueinfo 1 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) -__NR_rt_sigreturn 1 sys_rt_sigreturn (void) -__NR_setpriority 1 sys_setpriority (int which, int who, int nice) -__NR_setresuid 1 sys_setresuid (int uid, int euid, int suid) -__NR_getresuid 1 sys_getresuid (int *uid, int *euid, int *suid) -__NR_setresgid 1 sys_setresgid (int gid, int egid, int sgid) -__NR_getresgid 1 sys_getresgid (int *gid, int *egid, int *sgid) -__NR_getpgid 1 sys_getpgid (pid_t pid) -__NR_setfsuid 1 sys_setfsuid (int fsuid) -__NR_setfsgid 1 sys_setfsgid (int fsgid) -__NR_getsid 1 sys_getsid (void) -__NR_getgroups 1 sys_getgroups (int gsize, unsigned int *groups) -__NR_setgroups 1 sys_setgroups (int gsize, unsigned int *groups) -__NR_setrlimit 1 sys_setrlimit (int resource, struct krlimit *rlim) -__NR_umask 1 sys_umask (int mask) -__NR_prctl 1 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) -__NR_gettimeofday 1 sys_gettimeofday (struct timeval *tv, struct timezone *tz) -__NR_getpid 1 sys_getpid (void) -__NR_ptrace 1 sys_ptrace (long request, pid_t pid, void *addr, void *data) -__NR_gettid 1 sys_gettid (void) -__NR_shmat 1 sys_shmat (int shmid, void *shmaddr, int shmflag) -__NR_socket 1 sys_socket (int domain, int type, int protocol) -__NR_bind 2 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) -__NR_connect 
2 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) -__NR_sendto 2 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) -__NR_recvfrom 2 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) -__NR_setsockopt 2 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) -__NR_getsockopt 2 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) -__NR_shutdown 2 sys_shutdown (int sockfd, int how) -__NR_sendmsg 2 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) -__NR_recvmsg 2 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) -__NR_brk 2 sys_brk (void *addr) -__NR_munmap 2 sys_munmap (void *addr, unsigned long len) -__NR_mremap 2 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) -__NR_clone 2 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) -__NR_mmap 2 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) -__NR_mprotect 2 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) -__NR_mincore 2 sys_mincore (void *addr, unsigned long size, unsigned char *vec) -__NR_madvise 2 sys_madvise (unsigned long start, size_t len, int behavior) -__NR_rt_tgsigqueueinfo 2 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) -__NR_wait4 2 sys_wait4 (int pid, int *status, int options, struct rusage *ru) -__NR_fanotify_init 2 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) -__NR_fanotify_mark 2 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) -__NR_open_by_handle_at 2 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) -__NR_setns 2 sys_setns (int fd, int nstype) 
-__NR_kcmp 2 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) -__NR_seccomp 2 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) -__NR_memfd_create 2 sys_memfd_create (const char *name, unsigned int flags) -__NR_userfaultfd 2 sys_userfaultfd (int flags) -__NR_membarrier 3 sys_membarrier (int cmd, unsigned int flags, int cpu_id) -__NR_rseq 2 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_open_tree 4 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) -__NR_move_mount 4 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) -__NR_fsopen 4 sys_fsopen (char *fsname, unsigned int flags) -__NR_fsconfig 4 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) -__NR_fsmount 4 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) -__NR_pidfd_open 4 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_clone3 4 sys_clone3 (struct clone_args *uargs, size_t size) -__NR_openat2 4 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) -__NR_pidfd_getfd 4 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 32 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 39 sys_umount2 (char *name, int flags) +__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 57 sys_close (int fd) +__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 63 
sys_read (int fd, void *buf, unsigned long count) +__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 69 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 92 sys_personality (unsigned int personality) +__NR_exit 93 sys_exit (unsigned long error_code) +__NR_exit_group 94 sys_exit_group (int error_code) +__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) +__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent 
*timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 110 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 128 sys_restart_syscall (void) +__NR_kill 129 sys_kill (long pid, int sig) +__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 139 sys_rt_sigreturn (void) +__NR_setpriority 140 sys_setpriority (int which, int who, int nice) +__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 155 sys_getpgid (pid_t pid) +__NR_setfsuid 151 sys_setfsuid (int fsuid) +__NR_setfsgid 152 sys_setfsgid (int fsgid) +__NR_getsid 156 sys_getsid (void) +__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 166 sys_umask (int mask) +__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, 
unsigned long arg5) +__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 172 sys_getpid (void) +__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 178 sys_gettid (void) +__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 198 sys_socket (int domain, int type, int protocol) +__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_shutdown 210 sys_shutdown (int sockfd, int how) +__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 214 sys_brk (void *addr) +__NR_munmap 215 sys_munmap (void *addr, unsigned long len) +__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 240 
sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 268 sys_setns (int fd, int nstype) +__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 282 sys_userfaultfd (int flags) +__NR_membarrier 283 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) #__NR_dup2 ! sys_dup2 (int oldfd, int newfd) #__NR_rmdir ! sys_rmdir (const char *name) #__NR_unlink ! 
sys_unlink (char *pathname) From 359b25766f2268383c9c6519ac7cb74a407042d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 24 Aug 2023 21:07:51 +0200 Subject: [PATCH 121/321] memfd: don't set fd attributes not needed for vma mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is only one user of memfd_open() outside of memfd.c: open_filemap(). It is restoring a file-backed mapping and doesn't need nor expect to update F_SETOWN nor the fd's position. Check the inherited_fd() handling in the callers to simplify the code. Signed-off-by: MichaÅ‚ MirosÅ‚aw --- criu/files-reg.c | 3 ++- criu/memfd.c | 46 +++++++++++++++++++++++----------------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index 50dcbc4386..cf0c84b52e 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2508,7 +2508,8 @@ static int open_filemap(int pid, struct vma_area *vma) */ ret = dup(plugin_fd); } else if (vma->e->status & VMA_AREA_MEMFD) { - ret = memfd_open(vma->vmfd, &flags); + if (!inherited_fd(vma->vmfd, &ret)) + ret = memfd_open(vma->vmfd, &flags); } else { ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); } diff --git a/criu/memfd.c b/criu/memfd.c index 2158b67206..5fe0aeae38 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -331,14 +331,11 @@ int memfd_open(struct file_desc *d, u32 *fdflags) mfi = container_of(d, struct memfd_info, d); mfe = mfi->mfe; - if (inherited_fd(d, &fd)) - return fd; - pr_info("Restoring memfd id=%d\n", mfe->id); fd = memfd_open_inode(mfi->inode); if (fd < 0) - goto err; + return -1; /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; @@ -348,40 +345,43 @@ int memfd_open(struct file_desc *d, u32 *fdflags) * important though. 
*/ _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); - if (_fd < 0) { + if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); - goto err; - } + close(fd); - fd = _fd; + return _fd; +} + +static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) +{ + MemfdFileEntry *mfe; + int fd; + + if (inherited_fd(d, new_fd)) + return 0; + + fd = memfd_open(d, NULL); + if (fd < 0) + return -1; + + mfe = container_of(d, struct memfd_info, d)->mfe; if (restore_fown(fd, mfe->fown) < 0) goto err; if (lseek(fd, mfe->pos, SEEK_SET) < 0) { - pr_perror("Can't restore file position of memfd id=%d", mfe->id); + pr_perror("Can't restore file position of %d for memfd id=%d", fd, mfe->id); goto err; } - return fd; + *new_fd = fd; + return 0; err: - if (fd >= 0) - close(fd); + close(fd); return -1; } -static int memfd_open_fe_fd(struct file_desc *fd, int *new_fd) -{ - int tmp; - - tmp = memfd_open(fd, NULL); - if (tmp < 0) - return -1; - *new_fd = tmp; - return 0; -} - static char *memfd_d_name(struct file_desc *d, char *buf, size_t s) { MemfdInodeEntry *mie = NULL; From 675c5e46553d83d41f4b56517d45927564740d91 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 24 Aug 2023 15:22:32 -0700 Subject: [PATCH 122/321] ci/loongarch64: compile tests before running zdtm.py Otherwise tests fail by timeout. 
Signed-off-by: Andrei Vagin --- scripts/ci/loongarch64-qemu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh index 52e587619c..d5646468e8 100755 --- a/scripts/ci/loongarch64-qemu-test.sh +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -65,5 +65,5 @@ sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -P $PORT criu.tar $USER@127 # build and test run 'cd /root; tar -xf criu.tar' -run 'cd /root/criu; make -j4' +run 'cd /root/criu; make -j4 && make -j4 -C test/zdtm' run "cd /root/criu; ./test/zdtm.py run -t zdtm/static/maps02 -t zdtm/static/maps05 -t zdtm/static/maps06 -t zdtm/static/maps10 -t zdtm/static/maps_file_prot -t zdtm/static/memfd00 -t zdtm/transition/fork -t zdtm/transition/fork2 -t zdtm/transition/shmem -f h" From af31e8e242b8a5175475383ecf6ce062910f2aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 24 Aug 2023 20:21:28 +0200 Subject: [PATCH 123/321] kerndat: Make pagemap check more robust against swapped out pages. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix test of whether the kernel exposes page frame numbers to cope with the possibility that the top of the stack is swapped out, which was happening in about 1 out of 3 million runs. This led to a later failure when trying to read the PFN of the zero page, after which criu would exit with no error message. 
Original-From: Ambrose Feinstein Signed-off-by: Michał Mirosław --- criu/kerndat.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index bd1ccdc7d1..3ef080aca8 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -55,10 +55,11 @@ #include "util-caps.h" struct kerndat_s kdat = {}; +volatile int dummy_var; static int check_pagemap(void) { - int ret, fd; + int ret, fd, retry; u64 pfn = 0; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); @@ -72,11 +73,24 @@ static int check_pagemap(void) return -1; } - /* Get the PFN of some present page. Stack is here, so try it :) */ - ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn)); - if (ret != sizeof(pfn)) { - pr_perror("Can't read pagemap"); - return -1; + retry = 3; + while (retry--) { + ++dummy_var; + /* Get the PFN of a page likely to be present. */ + ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); + if (ret != sizeof(pfn)) { + pr_perror("Can't read pagemap"); + return -1; + } + /* The page can be swapped out by the time the read occurs, + * in which case the rest of the bits are a swap offset, + * and can't be used to determine whether PFNs are visible. + * Retry if this happens. */ + if (pfn & PME_PRESENT) + break; + pr_warn("got non-present PFN %#lx for the dummy data page; %s\n", (unsigned long)pfn, + retry ? 
"retrying" : "giving up"); + pfn = 0; } close(fd); From 38baf73b6d3e1409d817735f8eae62abbc1f373f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Tue, 22 Aug 2023 17:30:44 +0200 Subject: [PATCH 124/321] compel/infect: include the relevant pid in "no-breakpoints restore" debug message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał MirosÅ‚aw --- compel/src/lib/infect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 022d4ebf33..b9a913fa1e 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1577,7 +1577,7 @@ int compel_stop_pie(pid_t pid, void *addr, bool no_bp) int ret; if (no_bp) { - pr_debug("Force no-breakpoints restore\n"); + pr_debug("Force no-breakpoints restore of %d\n", pid); ret = 0; } else ret = ptrace_set_breakpoint(pid, addr); From ba27d27cd40993db3cacee2547b7583539328be6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 25 Aug 2023 13:36:21 +0200 Subject: [PATCH 125/321] proc_parse: remove trivial goto from vma_get_mapfile_user() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: MichaÅ‚ MirosÅ‚aw --- criu/proc_parse.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 61c1eee240..2ac01d6ff4 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -338,7 +338,7 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct fd = open(fname, O_RDONLY); if (fd < 0) { pr_perror("Can't open mapped [%s]", fname); - goto returnerr; + return -1; } if (vma_stat(vma, fd)) { @@ -379,7 +379,6 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); closefd: close(fd); -returnerr: return -1; } From 
2df6ec519d3348a23bff32c7f337bda72d443ba2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 23 Mar 2023 20:21:04 +0000 Subject: [PATCH 126/321] test/other: add test for action-script This commit is introducing a test for the action-script functionality of CRIU to verify that pre-dump, post-dump, pre-restore, pre-resume, post-restore, post-resume hooks are executed during dump/restore. Signed-off-by: Radostin Stoyanov --- Makefile | 1 + scripts/ci/run-ci-tests.sh | 3 ++ test/others/action-script/.gitignore | 1 + test/others/action-script/Makefile | 5 ++ test/others/action-script/action-script.sh | 2 + test/others/action-script/run.sh | 60 ++++++++++++++++++++++ 6 files changed, 72 insertions(+) create mode 100644 test/others/action-script/.gitignore create mode 100644 test/others/action-script/Makefile create mode 100755 test/others/action-script/action-script.sh create mode 100755 test/others/action-script/run.sh diff --git a/Makefile b/Makefile index 9a297d2d83..e4ddd887fe 100644 --- a/Makefile +++ b/Makefile @@ -449,6 +449,7 @@ lint: shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh + shellcheck -x test/others/action-script/*.sh codespell -S tags # Do not append \n to pr_perror, pr_pwarn or fail ! 
git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 79744c7507..47749e7fa8 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -316,6 +316,9 @@ make -C test/others/ns_ext run # config file parser and parameter testing make -C test/others/config-file run +# action script testing +make -C test/others/action-script run + # Skip all further tests when running with GCOV=1 # The one test which currently cannot handle GCOV testing is compel/test # Probably because the GCOV Makefile infrastructure does not exist in compel diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore new file mode 100644 index 0000000000..c0b6a2490a --- /dev/null +++ b/test/others/action-script/.gitignore @@ -0,0 +1 @@ +img-dir-* diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile new file mode 100644 index 0000000000..f1ce191dbc --- /dev/null +++ b/test/others/action-script/Makefile @@ -0,0 +1,5 @@ +run: + @make -C .. 
loop + ./run.sh + +.PHONY: run diff --git a/test/others/action-script/action-script.sh b/test/others/action-script/action-script.sh new file mode 100755 index 0000000000..aba8292c05 --- /dev/null +++ b/test/others/action-script/action-script.sh @@ -0,0 +1,2 @@ +#!/bin/bash +touch action-hook-"$CRTOOLS_SCRIPT_ACTION" diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh new file mode 100755 index 0000000000..a82fccf359 --- /dev/null +++ b/test/others/action-script/run.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +set -ebm + +# shellcheck source=test/others/env.sh +source ../env.sh || exit 1 + +SELFDIR="$(dirname "$(readlink -f "$0")")" +SCRIPT="$SELFDIR/action-script.sh" +IMGDIR="$SELFDIR/img-dir-$$" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +trap "cleanup" QUIT TERM INT HUP EXIT + +# shellcheck disable=SC2317 +# https://github.com/koalaman/shellcheck/issues/2660 +function cleanup() +{ + if [[ -n "$PID" ]]; then + kill -9 "$PID" + fi +} + +PID=$(../loop) +if ! $CRIU dump -v4 -o dump.log -t "$PID" -D "$IMGDIR" --action-script "$SCRIPT"; then + echo "Failed to checkpoint process $PID" + cat dump.log + kill -9 "$PID" + exit 1 +fi + +if ! $CRIU restore -v4 -o restore.log -D "$IMGDIR" -d --pidfile test.pidfile --action-script "$SCRIPT"; then + echo "CRIU restore failed" + echo FAIL + exit 1 +fi + +PID=$(cat "$IMGDIR"/test.pidfile) + +found_missing_file=false +hooks=("pre-dump" "post-dump" "pre-restore" "pre-resume" "post-restore" "post-resume") + +for hook in "${hooks[@]}" +do + if [ ! -e "$IMGDIR/action-hook-$hook" ]; then + echo "ERROR: action-hook-$hook does not exist" + found_missing_file=true + fi +done + +if [ "$found_missing_file" = true ]; then + exit 1 +fi + +echo PASS + +rm -rf "$IMGDIR" +exit 0 From 92b96a55a67e0e8eba2da1259d8a661018da36b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 24 Aug 2023 21:20:01 +0200 Subject: [PATCH 127/321] proc_parse: Log smaps entry while dumping VMA. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Help debugging problems with restoring custom VMAs. From: Michał Cłapiński Signed-off-by: Michał Mirosław --- criu/proc_parse.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 2ac01d6ff4..16392e3864 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -841,6 +841,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du goto err; } + pr_debug("Handling VMA with the following smaps entry: %s\n", str); if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd)) goto err; From a04fac57f71540654a7b475fc35ca150fe8f9bcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 24 Aug 2023 20:54:20 +0200 Subject: [PATCH 128/321] kerndat: Make errors from clone3() check more precise. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/kerndat.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 3ef080aca8..9aa035420f 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1417,17 +1417,20 @@ static bool kerndat_has_clone3_set_tid(void) */ pid = syscall(__NR_clone3, &args, sizeof(args)); - if (pid == -1 && (errno == ENOSYS || errno == E2BIG)) { - kdat.has_clone3_set_tid = false; - return 0; - } - if (pid == -1 && errno == EINVAL) { - kdat.has_clone3_set_tid = true; - } else { - pr_perror("Unexpected error from clone3"); + if (pid != -1) { + pr_err("Unexpected success: clone3() returned %d\n", pid); return -1; } + if (errno == ENOSYS || errno == E2BIG) + return 0; + + if (errno != EINVAL) { + pr_pwarn("Unexpected error from clone3"); + return 0; + } + + kdat.has_clone3_set_tid = true; return 0; } From d0f88ffdd5cc5ae7bc526043f2d331d816474399 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 28 Aug 2023 14:16:54 +0200 Subject: [PATCH 129/321] kerndat: check_pagemap: close(fd) on error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plug a fd leak when returning error from check_pagemap(). (Cosmetic, as the process will exit soon anyway.) Signed-off-by: Michał Mirosław --- criu/kerndat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/kerndat.c b/criu/kerndat.c index 9aa035420f..083e64114f 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -80,6 +80,7 @@ static int check_pagemap(void) ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); if (ret != sizeof(pfn)) { pr_perror("Can't read pagemap"); + close(fd); return -1; } /* The page can be swapped out by the time the read occurs, From 6d0e785182934c4072aca6fe7fb17f036cb78c2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 28 Aug 2023 16:38:40 +0200 Subject: [PATCH 130/321] kerndat: check_pagemap: reword retried case explanation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/kerndat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 083e64114f..37b265d8de 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -84,8 +84,8 @@ static int check_pagemap(void) return -1; } /* The page can be swapped out by the time the read occurs, - * in which case the rest of the bits are a swap offset, - * and can't be used to determine whether PFNs are visible. + * in which case the rest of the bits are a swap type + offset + * (which could be zero even if not hidden). * Retry if this happens. 
*/ if (pfn & PME_PRESENT) break; From e0d13ef05c04d0ac550b4981d89b860f835ba61b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 27 Jul 2023 21:56:17 +0200 Subject: [PATCH 131/321] memfd: return original memfd fd for execveat() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If there is only a single RW opened fd for a memfd, it can be used to pass it to execveat() with AT_EMPTY_PATH to have its contents executed. This currently works only for the original fd from memfd_create(). For now we ignore processes that reopen the memfd's rw and expect a particular executability trait of it. (Note: for security purposes recent kernels have SEAL_EXEC to make memfds non-executable.) Signed-off-by: Michał Mirosław --- criu/memfd.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/criu/memfd.c b/criu/memfd.c index 5fe0aeae38..a770c66a11 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -46,6 +46,7 @@ struct memfd_restore_inode { int fdstore_id; unsigned int pending_seals; MemfdInodeEntry *mie; + bool was_opened_rw; }; static LIST_HEAD(memfd_inodes); @@ -233,6 +234,7 @@ static int collect_one_memfd_inode(void *o, ProtobufCMessage *base, struct cr_im mutex_init(&inode->lock); inode->fdstore_id = -1; inode->pending_seals = 0; + inode->was_opened_rw = false; list_add_tail(&inode->list, &memfd_inodes); @@ -339,6 +341,24 @@ int memfd_open(struct file_desc *d, u32 *fdflags) /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; + + if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { + /* + * If there is only a single RW-opened fd for a memfd, it can + * be used to pass it to execveat() with AT_EMPTY_PATH to have + * its contents executed. This currently works only for the + * original fd from memfd_create() so return the original fd + * once -- in case the caller expects to be the sole opener + * and does execveat() from this memfd. 
+ */ + if (!fcntl(fd, F_SETFL, flags)) { + mfi->inode->was_opened_rw = true; + return fd; + } + + pr_pwarn("Can't change fd flags to %#o for memfd id=%d", flags, mfe->id); + } + /* * Ideally we should call compat version open() to not force the * O_LARGEFILE file flag with regular open(). It doesn't seem that @@ -347,6 +367,8 @@ int memfd_open(struct file_desc *d, u32 *fdflags) _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); + else if ((flags & O_ACCMODE) == O_RDWR) + pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); close(fd); return _fd; From a652c686652df835ce9212fa5bca76f74cc3b284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 27 Jul 2023 17:33:42 +0200 Subject: [PATCH 132/321] zdtm: test execveat(memfd) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał MirosÅ‚aw --- test/zdtm/static/Makefile | 1 + test/zdtm/static/memfd04.c | 105 ++++++++++++++++++++++++++++++++++ test/zdtm/static/memfd04.desc | 1 + 3 files changed, 107 insertions(+) create mode 100644 test/zdtm/static/memfd04.c create mode 100644 test/zdtm/static/memfd04.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index cd53932db4..b7fb79643c 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -258,6 +258,7 @@ TST_NOFILE := \ memfd02 \ memfd02-hugetlb \ memfd03 \ + memfd04 \ shmemfd \ shmemfd-priv \ time \ diff --git a/test/zdtm/static/memfd04.c b/test/zdtm/static/memfd04.c new file mode 100644 index 0000000000..aae7864c10 --- /dev/null +++ b/test/zdtm/static/memfd04.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "exec(memfd)"; +const char *test_author = "MichaÅ‚ MirosÅ‚aw "; + +static int _memfd_create(const char *name, unsigned int flags) +{ + 
return syscall(SYS_memfd_create, name, flags); +} + +static int _execveat(int dirfd, const char *pathname, const char *const argv[], const char *const envp[], int flags) +{ + return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); +} + +static const char *const script_argv[] = { "true", NULL }; +static const char *const script_env[] = { NULL }; + +static bool test_exec_fd(int fd) +{ + int err, pid, status; + + err = fcntl(fd, F_GETFD); + if (err < 0) { + fail("fcntl(F_GETFD)"); + return false; + } + if (err) { + errno = 0; + fail("F_GETFD for the memfd returned %d but expected 0", err); + return false; + } + + pid = fork(); + if (!pid) { + _execveat(fd, "", script_argv, script_env, AT_EMPTY_PATH); + err = errno; + pr_perror("execveat()"); + _exit(err); + } + + if (pid < 0) { + fail("fork()"); + return false; + } + + while (waitpid(pid, &status, 0) != pid) { + if (errno == EINTR) + continue; + fail("waitpid(child=%d)", pid); + return false; + } + + if (status != 0) { + pr_err("child exited with status=%d\n", status); + return false; + } + + return true; +} + +static const char script[] = "#!/bin/true"; +static const size_t script_len = sizeof(script) - 1; + +int main(int argc, char *argv[]) +{ + int fd; + + test_init(argc, argv); + + fd = _memfd_create("somename", 0); + if (fd < 0) { + fail("memfd_create()"); + return 1; + } + + if (write(fd, script, script_len) != script_len) { + fail("write(memfd)"); + return 1; + } + + if (!test_exec_fd(fd)) + return 1; + + test_msg("execveat(memfd) succeeded before C/R.\n"); + + test_daemon(); + test_waitsig(); + + if (!test_exec_fd(fd)) + return 1; + + pass(); + + return 0; +} diff --git a/test/zdtm/static/memfd04.desc b/test/zdtm/static/memfd04.desc new file mode 100644 index 0000000000..bbf136d145 --- /dev/null +++ b/test/zdtm/static/memfd04.desc @@ -0,0 +1 @@ +{'deps': ['/bin/true']} From 959a32da5f0af7348570d4626c3ceb929ccd4dab Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 29 Aug 2023 11:09:59 +0800 
Subject: [PATCH 133/321] CONTRIBUTING.md: don't mention ctags Ctags is mentioned in the beginning of the "Edit the source code" section, which is really confusing: Do you need ctags to edit CRIU code? - No. It is just one helpful tool to browse the code, and we do not want to enforce it. So, what is it doing in the contribution guide? People who really need it should be able to find it in Makefile or just write a one-liner of their own to collect tags... Signed-off-by: Pavel Tikhomirov --- CONTRIBUTING.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 87da08b343..3cd74128ed 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,12 +46,6 @@ This should create the `./criu/criu` executable. ## Edit the source code -If you use ctags, you can generate the ctags file by running - -``` - make tags -``` - When you change the source code, please keep in mind the following code conventions: * we prefer tabs and indentations to be 8 characters width From 75146b02c9fa64b748aaa162efe5c0796c0b689c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 29 Aug 2023 12:10:59 +0800 Subject: [PATCH 134/321] CONTRIBUTING.md: improve coding-style related sections This is to highlight that code readability is the real goal of all the coding-style rules. We should not do coding-style just for coding-style, e.g. when clang-format suggests crazy formatting we should not follow it if we feel it is bad. Signed-off-by: Pavel Tikhomirov --- CONTRIBUTING.md | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3cd74128ed..a70506bfbf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -48,11 +48,16 @@ This should create the `./criu/criu` executable. 
When you change the source code, please keep in mind the following code conventions: +* code is written to be read, so the code readability is the most important thing you need to have in mind when preparing patches * we prefer tabs and indentations to be 8 characters width -* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community. +* we prefer line length of 80 characters or less, more is allowed if it helps with code readability +* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community -Other conventions can be learned from the source code itself. In short, make sure your new code -looks similar to what is already there. +Other conventions can be learned from the source code itself. In short, make sure your new code looks similar to what is already there. + +## Automatic tools to fix coding-style + +Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. The following command can be used to automatically run a code linter for Python files (flake8), Shell scripts (shellcheck), text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). @@ -84,6 +89,41 @@ to check the last *N* commits for formatting errors, without applying the change Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected we need to review the suggested changes and decide if they should be fixed before merging. 
+Here are some bad examples of clang-format-ing: + +* if clang-format tries to force 120 characters and breaks readability - it is wrong: + +``` +@@ -58,8 +59,7 @@ static int register_membarriers(void) + } + + if (!all_ok) { +- fail("can't register membarrier()s - tried %#x, kernel %#x", +- barriers_registered, barriers_supported); ++ fail("can't register membarrier()s - tried %#x, kernel %#x", barriers_registered, barriers_supported); + return -1; + } +``` + +* if clang-format breaks your beautiful readability friendly alignment in structures, comments or defines - it is wrong: + +``` +--- a/test/zdtm/static/membarrier.c ++++ b/test/zdtm/static/membarrier.c +@@ -27,9 +27,10 @@ static const struct { + int register_cmd; + int execute_cmd; + } membarrier_cmds[] = { +- { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, +- { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, +- { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, ++ { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, ++ { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, ++ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, ++ { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, + }; +``` + ## Test your changes CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run From 03541c0e6dac74cfdfd8876c06615e51ff03c603 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 24 Aug 2023 13:34:05 +0800 Subject: [PATCH 135/321] lint: don't fail workflow on indent fail There are multiple cases where good human readable code block is converted to an unreadable mess by clang-format, so we don't want to rely on clang-format completely. 
Also there is no way, as far as I can see, to make clang-format only fix what we want it to fix without breaking something. So let's just display hints inline where clang-format is unhappy. When reviewer sees such a warning it's a good sign that something is broken in coding-style around this warning. We add special script which parses diff generated by indent and generates warning for each hunk. Signed-off-by: Pavel Tikhomirov --- .github/workflows/lint.yml | 18 ++++++++--------- Makefile | 1 + scripts/github-indent-warnings.py | 33 +++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 9 deletions(-) create mode 100755 scripts/github-indent-warnings.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index e18f921f3e..f52bce8123 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -26,15 +26,15 @@ jobs: run: make lint - name: Run make indent - run: > + continue-on-error: true + run: | if [ -z "${{github.base_ref}}" ]; then - git fetch --deepen=1 && - if ! make indent OPTS=--diff; then - exit 1 - fi + git fetch --deepen=1 + make indent else - git fetch origin ${{github.base_ref}} && - if ! 
make indent OPTS=--diff BASE=origin/${{github.base_ref}}; then - exit 1 - fi + git fetch origin ${{github.base_ref}} + make indent BASE=origin/${{github.base_ref}} fi + - name: Raise in-line make indent warnings + run: | + git diff | ./scripts/github-indent-warnings.py diff --git a/Makefile b/Makefile index e4ddd887fe..8f2c294d5a 100644 --- a/Makefile +++ b/Makefile @@ -442,6 +442,7 @@ lint: flake8 --config=scripts/flake8.cfg crit/setup.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump + flake8 --config=scripts/flake8.cfg scripts/github-indent-warnings.py shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install diff --git a/scripts/github-indent-warnings.py b/scripts/github-indent-warnings.py new file mode 100755 index 0000000000..04f82d6c11 --- /dev/null +++ b/scripts/github-indent-warnings.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 +import sys +import re + +re_file = r'^diff --git a/(\S\S*)\s.*$' +re_line = r'^@@ -(\d\d*)\D.*@@.*$' + +if __name__ == '__main__': + if len(sys.argv) != 1 and len(sys.argv) != 2: + print(f'usage: {sys.argv[0]} ') + print(f'usage: | {sys.argv[0]}') + exit(1) + + input_file = sys.stdin.fileno() + if len(sys.argv) == 2: + input_file = sys.argv[1] + + with open(input_file, 'r') as fi: + file_name = None + line_number = None + for line in fi: + file_matches = re.findall(re_file, line) + if len(file_matches) == 1: + file_name = file_matches[0] + continue + + if file_name is None: + continue + + line_matches = re.findall(re_line, line) + if len(line_matches) == 1: + line_number = int(line_matches[0]) + 3 + print(f'::warning file={file_name},line={line_number}::clang-format: Possible coding style problem (https://github.com/checkpoint-restore/criu/blob/criu-dev/CONTRIBUTING.md#automatic-tools-to-fix-coding-style)') From 1df618a3e1d846d3e3878ffd7c7d6144774ca6e2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 20 May 2023 
08:45:50 +0100 Subject: [PATCH 136/321] vagrant: update to version 2.3.7 This patch also updated the download URL format from https://releases.hashicorp.com/vagrant/2.3.7/vagrant_2.3.7_x86_64.deb to https://releases.hashicorp.com/vagrant/2.3.7/vagrant_2.3.7-1_amd64.deb Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index ac4b5579d5..e26b5d786a 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,7 +6,7 @@ set -e set -x -VAGRANT_VERSION=2.2.19 +VAGRANT_VERSION=2.3.7 FEDORA_VERSION=37 FEDORA_BOX_VERSION=37.20221105.0 @@ -19,7 +19,7 @@ setup() { # Tar up the git checkout to have vagrant rsync it to the VM tar cf criu.tar ../../../criu # Cirrus has problems with the following certificate. - wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_"$(uname -m)".deb -O /tmp/vagrant.deb && \ + wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ From 82bfb67abe6eab7aa2d6fedc30e34647beaf8c7a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 20 May 2023 08:47:04 +0100 Subject: [PATCH 137/321] vagrant: run tests with fedora 38 Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index e26b5d786a..328903f385 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -7,8 +7,8 @@ set -e set -x VAGRANT_VERSION=2.3.7 -FEDORA_VERSION=37 -FEDORA_BOX_VERSION=37.20221105.0 +FEDORA_VERSION=38 +FEDORA_BOX_VERSION=38.20230413.1 setup() { if [ -n "$TRAVIS" ]; then From 4b7287be9b55f4651bd32587ab8446966b4bdd6c Mon Sep 17 00:00:00 
2001 From: Michal Clapinski Date: Fri, 8 Sep 2023 23:34:53 +0200 Subject: [PATCH 138/321] dump: use MEMBARRIER_CMD_GET_REGISTRATIONS when available MEMBARRIER_CMD_GET_REGISTRATIONS can tell us whether or not the process used MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED unlike the old probing method. Falls back to the old method when MEMBARRIER_CMD_GET_REGISTRATIONS is unavailable. Signed-off-by: Michal Clapinski --- criu/include/kerndat.h | 1 + criu/include/parasite.h | 2 ++ criu/kerndat.c | 23 +++++++++++++++++++++ criu/parasite-syscall.c | 1 + criu/pie/parasite.c | 46 +++++++++++++++++++++++++++++------------ 5 files changed, 60 insertions(+), 13 deletions(-) diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 0b2f715f38..f5d409acbf 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -85,6 +85,7 @@ struct kerndat_s { bool has_ptrace_get_rseq_conf; struct __ptrace_rseq_configuration libc_rseq_conf; bool has_ipv6_freebind; + bool has_membarrier_get_registrations; }; extern struct kerndat_s kdat; diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 5209b6da22..1244220f67 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -118,6 +118,8 @@ static inline int posix_timers_dump_size(int timer_n) */ struct parasite_dump_misc { + bool has_membarrier_get_registrations; /* this is sent from criu to parasite. 
*/ + unsigned long brk; u32 pid; diff --git a/criu/kerndat.c b/criu/kerndat.c index 37b265d8de..fef5a46c19 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -17,6 +17,7 @@ #include #include #include +#include #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include @@ -1636,6 +1637,24 @@ static int kerndat_has_ipv6_freebind(void) return ret; } +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int kerndat_has_membarrier_get_registrations(void) +{ + int ret = syscall(__NR_membarrier, 1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0); + if (ret < 0) { + if (errno != EINVAL) { + return ret; + } + + kdat.has_membarrier_get_registrations = false; + } else { + kdat.has_membarrier_get_registrations = true; + } + + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1879,6 +1898,10 @@ int kerndat_init(void) pr_err("kerndat_has_ipv6_freebind failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_membarrier_get_registrations()) { + pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index c08ed09b18..295e404ec5 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -433,6 +433,7 @@ int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_mis struct parasite_dump_misc *ma; ma = compel_parasite_args(ctl, struct parasite_dump_misc); + ma->has_membarrier_get_registrations = kdat.has_membarrier_get_registrations; if (compel_rpc_call_sync(PARASITE_CMD_DUMP_MISC, ctl) < 0) return -1; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index c0604903b9..e151ed6563 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -246,6 +246,27 @@ static int get_membarrier_registration_mask(int cmd_bit) #define 
MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED 3 #define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE 5 #define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ 7 +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int dump_membarrier_compat(int *membarrier_registration_mask) +{ + int ret; + + *membarrier_registration_mask = 0; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + return 0; +} static int dump_misc(struct parasite_dump_misc *args) { @@ -261,19 +282,18 @@ static int dump_misc(struct parasite_dump_misc *args) args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); args->thp_disabled = sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); - args->membarrier_registration_mask = 0; - ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); - if (ret < 0) - return -1; - args->membarrier_registration_mask |= ret; - ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); - if (ret < 0) - return -1; - args->membarrier_registration_mask |= ret; - ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); - if (ret < 0) - return -1; - args->membarrier_registration_mask |= ret; + if (args->has_membarrier_get_registrations) { + ret = sys_membarrier(1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + pr_err("membarrier(1 << %d) returned %d\n", MEMBARRIER_CMDBIT_GET_REGISTRATIONS, ret); + return -1; + } + args->membarrier_registration_mask = ret; + } else { + ret = dump_membarrier_compat(&args->membarrier_registration_mask); + if (ret) + return ret; + } ret = sys_prctl(PR_GET_CHILD_SUBREAPER, (unsigned 
long)&args->child_subreaper, 0, 0, 0); if (ret) From d7524792db48716bb9005637c8e1280d1792b07e Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 13 Sep 2023 18:42:58 +0200 Subject: [PATCH 139/321] zdtm: test MEMBARRIER_CMD_GLOBAL_EXPEDITED migration Check membarrier registration both ways: 1. By issuing membarrier commands and checking if they succeed. 2. By issuing MEMBARRIER_CMD_GET_REGISTRATIONS. The first way is needed for older kernels. The second way is needed to test MEMBARRIER_CMD_GLOBAL_EXPEDITED. Signed-off-by: Michal Clapinski --- test/zdtm/static/membarrier.c | 51 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/test/zdtm/static/membarrier.c b/test/zdtm/static/membarrier.c index a04b360351..85d705ba7b 100644 --- a/test/zdtm/static/membarrier.c +++ b/test/zdtm/static/membarrier.c @@ -8,14 +8,16 @@ const char *test_author = "MichaÅ‚ MirosÅ‚aw "; /* * Define membarrier() CMDs to avoid depending on exact kernel header version. - * FIXME: use MEMBARRIER_CMD_GET_REGISTRATIONS if supported by kernel. 
*/ +#define MEMBARRIER_CMD_GLOBAL_EXPEDITED (1 << 1) +#define MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED (1 << 2) #define MEMBARRIER_CMD_PRIVATE_EXPEDITED (1 << 3) #define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (1 << 4) #define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE (1 << 5) #define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE (1 << 6) #define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ (1 << 7) #define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8) +#define MEMBARRIER_CMD_GET_REGISTRATIONS (1 << 9) static int membarrier(int cmd, unsigned int flags, int cpu_id) { @@ -27,9 +29,14 @@ static const struct { int register_cmd; int execute_cmd; } membarrier_cmds[] = { - { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, - { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, - { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, + { "GLOBAL_EXPEDITED", MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, + MEMBARRIER_CMD_GLOBAL_EXPEDITED }, + { "PRIVATE_EXPEDITED", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, + MEMBARRIER_CMD_PRIVATE_EXPEDITED }, + { "PRIVATE_EXPEDITED_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, + { "PRIVATE_EXPEDITED_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, }; static const int n_membarrier_cmds = sizeof(membarrier_cmds) / sizeof(*membarrier_cmds); @@ -49,10 +56,10 @@ static int register_membarriers(void) if (~barriers_supported & membarrier_cmds[i].register_cmd) continue; - barriers_registered |= membarrier_cmds[i].execute_cmd; + barriers_registered |= membarrier_cmds[i].register_cmd; if (membarrier(membarrier_cmds[i].register_cmd, 0, 0) < 0) { - pr_perror("membarrier(REGISTER_PRIVATE_EXPEDITED%s)", membarrier_cmds[i].name_suffix); + pr_perror("membarrier(REGISTER_%s)", 
membarrier_cmds[i].name_suffix); all_ok = false; } } @@ -71,15 +78,15 @@ static int register_membarriers(void) return barriers_registered; } -static bool check_membarriers(int barriers_registered) +static bool check_membarriers_compat(int barriers_registered) { bool all_ok = true; for (int i = 0; i < n_membarrier_cmds; ++i) { - if (~barriers_registered & membarrier_cmds[i].execute_cmd) + if (~barriers_registered & membarrier_cmds[i].register_cmd) continue; if (membarrier(membarrier_cmds[i].execute_cmd, 0, 0) < 0) { - pr_perror("membarrier(PRIVATE_EXPEDITED%s)", membarrier_cmds[i].name_suffix); + pr_perror("membarrier(%s)", membarrier_cmds[i].name_suffix); all_ok = false; } } @@ -90,6 +97,32 @@ static bool check_membarriers(int barriers_registered) return all_ok; } +static bool check_membarriers_get_registrations(int barriers_registered) +{ + int ret = membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + if (errno == EINVAL) { + test_msg("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS) not supported by running kernel"); + return true; + } + fail("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS)"); + return false; + } + if (ret != barriers_registered) { + fail("MEMBARRIER_CMD_GET_REGISTRATIONS check failed, expected: %d, got: %d", + barriers_registered, ret); + return false; + } + + return true; +} + +static bool check_membarriers(int barriers_registered) +{ + return check_membarriers_compat(barriers_registered) && + check_membarriers_get_registrations(barriers_registered); +} + int main(int argc, char **argv) { int barriers_registered; From f043f53cff6b50af15cf642e3916761bd2ffadb8 Mon Sep 17 00:00:00 2001 From: David Francis Date: Tue, 25 Apr 2023 09:39:28 -0400 Subject: [PATCH 140/321] criu/plugin: Add environment variable to cap size of buffers. The amdgpu plugin would create a memory buffer at the size of the largest VRAM bo (buffer object). On some systems, VRAM size exceeds RAM size, so the largest bo might be larger than the available memory. 
Add an environment variable KFD_MAX_BUFFER_SIZE, which caps the size of this buffer. By default, it is set to 0, and has no effect. When active, any bo larger than its value will be saved to/restored from file in multiple passes. Signed-off-by: David Francis --- Documentation/criu-amdgpu-plugin.txt | 9 + plugins/amdgpu/amdgpu_plugin.c | 332 ++++++++++++++++----------- 2 files changed, 202 insertions(+), 139 deletions(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 48a8e2f6d1..35321a9159 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -97,6 +97,15 @@ executing criu command. E.g: KFD_CAPABILITY_CHECK=1 +*KFD_MAX_BUFFER_SIZE*:: + On some systems, VRAM sizes may exceed RAM sizes, and so buffers for dumping + and restoring VRAM may be unable to fit. Set to a nonzero value (in bytes) + to set a limit on the plugin's memory usage. + Default:0 (Disabled) + + E.g: + KFD_MAX_BUFFER_SIZE="2G" + AUTHOR ------ diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 6397ecdb74..6a79f8b19d 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -107,6 +107,8 @@ extern bool kfd_vram_size_check; extern bool kfd_numa_check; extern bool kfd_capability_check; +size_t kfd_max_buffer_size; + /**************************************************************************************************/ int write_fp(FILE *fp, const void *buf, const size_t buf_len) @@ -449,6 +451,48 @@ void getenv_bool(const char *var, bool *value) pr_info("param: %s:%s\n", var, *value ? 
"Y" : "N"); } +void getenv_size_t(const char *var, size_t *value) +{ + char *value_str = getenv(var); + char *endp = value_str; + int sh = 0; + size_t size; + + pr_info("Value str: %s\n", value_str); + + if (value_str) { + size = (size_t)strtoul(value_str, &endp, 0); + if (errno || value_str == endp) { + pr_err("Ignoring invalid value for %s=%s, expecting a positive integer\n", var, value_str); + return; + } + switch (*endp) { + case 'k': + case 'K': + sh = 10; + break; + case 'M': + sh = 20; + break; + case 'G': + sh = 30; + break; + case '\0': + sh = 0; + break; + default: + pr_err("Ignoring invalid size suffix for %s=%s, expecting 'K'/k', 'M', or 'G'\n", var, value_str); + return; + } + if (SIZE_MAX >> sh < size) { + pr_err("Ignoring invalid value for %s=%s, exceeds SIZE_MAX\n", var, value_str); + return; + } + *value = size << sh; + } + pr_info("param: %s:0x%lx\n", var, *value); +} + int amdgpu_plugin_init(int stage) { pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); @@ -476,6 +520,9 @@ int amdgpu_plugin_init(int stage) getenv_bool("KFD_NUMA_CHECK", &kfd_numa_check); getenv_bool("KFD_CAPABILITY_CHECK", &kfd_capability_check); } + kfd_max_buffer_size = 0; + getenv_size_t("KFD_MAX_BUFFER_SIZE", &kfd_max_buffer_size); + return 0; } @@ -607,16 +654,14 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type) +int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) { - uint64_t size, gpu_addr_src, gpu_addr_dest, gpu_addr_ib; - uint64_t gpu_addr_src_orig, gpu_addr_dest_orig; - amdgpu_va_handle h_va_src, h_va_dest, h_va_ib; - amdgpu_bo_handle h_bo_src, h_bo_dest, h_bo_ib; + uint64_t size, src_bo_size, dst_bo_size, 
buffer_bo_size, bytes_remain, buffer_space_remain; + uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; + amdgpu_va_handle h_va_src, h_va_dst, h_va_ib; + amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib; struct amdgpu_bo_import_result res = { 0 }; - uint64_t copy_size, bytes_remain, j = 0; - uint64_t n_packets; struct amdgpu_cs_ib_info ib_info; amdgpu_bo_list_handle h_bo_list; struct amdgpu_cs_request cs_req; @@ -625,102 +670,100 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am uint32_t expired; amdgpu_context_handle h_ctx; uint32_t *ib = NULL; - int err, shared_fd; + int j, err, shared_fd, packets_per_buffer; - shared_fd = bo_buckets[i].dmabuf_fd; - size = bo_buckets[i].size; + shared_fd = bo_bucket.dmabuf_fd; + size = bo_bucket.size; + buffer_bo_size = min(size, buffer_size); + packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1; + src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size; + dst_bo_size = (type == SDMA_OP_VRAM_READ) ? 
buffer_bo_size : size; plugin_log_msg("Enter %s\n", __func__); /* prepare src buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_src); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, src_bo_size, &h_bo_src); if (err) { pr_perror("failed to create userptr for sdma"); return -EFAULT; } - break; - case SDMA_OP_VRAM_READ: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); return -EFAULT; } - h_bo_src = res.buf_handle; break; - default: pr_perror("Invalid sdma operation"); return -EINVAL; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_src, &h_va_src, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, src_bo_size, 0x1000, 0, &gpu_addr_src, + &h_va_src, 0); if (err) { pr_perror("failed to alloc VA for src bo"); goto err_src_va; } - err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_src, 0, src_bo_size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the src BO"); goto err_src_bo_map; } - plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, size); + plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, src_bo_size); + /* prepare dest buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - - h_bo_dest = res.buf_handle; + h_bo_dst = res.buf_handle; break; - case SDMA_OP_VRAM_READ: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_dest); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, dst_bo_size, &h_bo_dst); if (err) { pr_perror("failed to create userptr for sdma"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; 
} break; - default: pr_perror("Invalid sdma operation"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_dest, &h_va_dest, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, dst_bo_size, 0x1000, 0, &gpu_addr_dst, + &h_va_dst, 0); if (err) { pr_perror("failed to alloc VA for dest bo"); - goto err_dest_va; + goto err_dst_va; } - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, dst_bo_size, gpu_addr_dst, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the dest BO"); - goto err_dest_bo_map; + goto err_dst_bo_map; } - plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dest, size); + plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dst, dst_bo_size); - n_packets = (size + max_copy_size) / max_copy_size; /* prepare ring buffer/indirect buffer for command submission * each copy packet is 7 dwords so we need to alloc 28x size for ib */ - err = alloc_and_map(h_dev, n_packets * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, + err = alloc_and_map(h_dev, packets_per_buffer * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, (void **)&ib); if (err) { pr_perror("failed to allocate and map ib/rb"); goto err_ib_gpu_alloc; } - - plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, n_packets * 28); + plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, packets_per_buffer * 28); resources[0] = h_bo_src; - resources[1] = h_bo_dest; + resources[1] = h_bo_dst; resources[2] = h_bo_ib; err = amdgpu_bo_list_create(h_dev, 3, resources, NULL, &h_bo_list); if (err) { @@ -728,103 +771,122 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am goto err_bo_list; } - memset(&cs_req, 0, sizeof(cs_req)); - memset(&fence, 0, sizeof(fence)); - memset(ib, 0, n_packets * 28); - - plugin_log_msg("setting 
up sdma packets for command submission\n"); bytes_remain = size; - gpu_addr_src_orig = gpu_addr_src; - gpu_addr_dest_orig = gpu_addr_dest; + if (type == SDMA_OP_VRAM_WRITE) + copy_dst = gpu_addr_dst; + else + copy_src = gpu_addr_src; + while (bytes_remain > 0) { - copy_size = min(bytes_remain, max_copy_size); - - ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); - ib[j++] = copy_size; - ib[j++] = 0; - ib[j++] = 0xffffffff & gpu_addr_src; - ib[j++] = (0xffffffff00000000 & gpu_addr_src) >> 32; - ib[j++] = 0xffffffff & gpu_addr_dest; - ib[j++] = (0xffffffff00000000 & gpu_addr_dest) >> 32; - - gpu_addr_src += copy_size; - gpu_addr_dest += copy_size; - bytes_remain -= copy_size; - } - - gpu_addr_src = gpu_addr_src_orig; - gpu_addr_dest = gpu_addr_dest_orig; - plugin_log_msg("pad the IB to align on 8 dw boundary\n"); - /* pad the IB to the required number of dw with SDMA_NOP */ - while (j & 7) - ib[j++] = SDMA_NOP; - - ib_info.ib_mc_address = gpu_addr_ib; - ib_info.size = j; - - cs_req.ip_type = AMDGPU_HW_IP_DMA; - /* possible future optimization: may use other rings, info available in - * amdgpu_query_hw_ip_info() - */ - cs_req.ring = 0; - cs_req.number_of_ibs = 1; - cs_req.ibs = &ib_info; - cs_req.resources = h_bo_list; - cs_req.fence_info.handle = NULL; - - plugin_log_msg("create the context\n"); - err = amdgpu_cs_ctx_create(h_dev, &h_ctx); - if (err) { - pr_perror("failed to create context for SDMA command submission"); - goto err_ctx; - } + memset(&cs_req, 0, sizeof(cs_req)); + memset(&fence, 0, sizeof(fence)); + memset(ib, 0, packets_per_buffer * 28); + + if (type == SDMA_OP_VRAM_WRITE) { + err = read_fp(storage_fp, buffer, min(bytes_remain, buffer_bo_size)); + if (err) { + pr_perror("failed to read from storage"); + goto err_bo_list; + } + } - plugin_log_msg("initiate sdma command submission\n"); - err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); - if (err) { - pr_perror("failed to submit command for SDMA IB"); - goto err_cs_submit_ib; - } + 
buffer_space_remain = buffer_bo_size; + if (type == SDMA_OP_VRAM_WRITE) + copy_src = gpu_addr_src; + else + copy_dst = gpu_addr_dst; + j = 0; + + while (bytes_remain > 0 && buffer_space_remain > 0) { + copy_size = min(min(bytes_remain, max_copy_size), buffer_space_remain); + + ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); + ib[j++] = copy_size; + ib[j++] = 0; + ib[j++] = 0xffffffff & copy_src; + ib[j++] = (0xffffffff00000000 & copy_src) >> 32; + ib[j++] = 0xffffffff & copy_dst; + ib[j++] = (0xffffffff00000000 & copy_dst) >> 32; + + copy_src += copy_size; + copy_dst += copy_size; + bytes_remain -= copy_size; + buffer_space_remain -= copy_size; + } + /* pad the IB to the required number of dw with SDMA_NOP */ + while (j & 7) + ib[j++] = SDMA_NOP; - fence.context = h_ctx; - fence.ip_type = AMDGPU_HW_IP_DMA; - fence.ip_instance = 0; - fence.ring = 0; - fence.fence = cs_req.seq_no; - err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); - if (err) { - pr_perror("failed to query fence status"); - goto err_cs_submit_ib; - } + ib_info.ib_mc_address = gpu_addr_ib; + ib_info.size = j; - if (!expired) { - pr_err("IB execution did not complete\n"); - err = -EBUSY; - goto err_cs_submit_ib; - } + cs_req.ip_type = AMDGPU_HW_IP_DMA; + /* possible future optimization: may use other rings, info available in + * amdgpu_query_hw_ip_info() + */ + cs_req.ring = 0; + cs_req.number_of_ibs = 1; + cs_req.ibs = &ib_info; + cs_req.resources = h_bo_list; + cs_req.fence_info.handle = NULL; - plugin_log_msg("done querying fence status\n"); + err = amdgpu_cs_ctx_create(h_dev, &h_ctx); + if (err) { + pr_perror("failed to create context for SDMA command submission"); + goto err_ctx; + } + err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); + if (err) { + pr_perror("failed to submit command for SDMA IB"); + goto err_cs_submit_ib; + } + + fence.context = h_ctx; + fence.ip_type = AMDGPU_HW_IP_DMA; + fence.ip_instance = 0; + fence.ring = 0; + fence.fence 
= cs_req.seq_no; + err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); + if (err) { + pr_perror("failed to query fence status"); + goto err_cs_submit_ib; + } + if (!expired) { + pr_err("IB execution did not complete\n"); + err = -EBUSY; + goto err_cs_submit_ib; + } + + if (type == SDMA_OP_VRAM_READ) { + err = write_fp(storage_fp, buffer, buffer_bo_size - buffer_space_remain); + if (err) { + pr_perror("failed to write out to storage"); + goto err_cs_submit_ib; + } + } err_cs_submit_ib: - amdgpu_cs_ctx_free(h_ctx); + amdgpu_cs_ctx_free(h_ctx); + if (err) + break; + } err_ctx: amdgpu_bo_list_destroy(h_bo_list); err_bo_list: - free_and_unmap(n_packets * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); + free_and_unmap(packets_per_buffer * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); err_ib_gpu_alloc: - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_UNMAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, size, gpu_addr_dst, 0, AMDGPU_VA_OP_UNMAP); if (err) - pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dest, size); -err_dest_bo_map: - err = amdgpu_va_range_free(h_va_dest); + pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dst, size); +err_dst_bo_map: + err = amdgpu_va_range_free(h_va_dst); if (err) pr_perror("dest range free failed"); -err_dest_va: - err = amdgpu_bo_free(h_bo_dest); +err_dst_va: + err = amdgpu_bo_free(h_bo_dst); if (err) pr_perror("dest bo free failed"); - -err_dest_bo_prep: +err_dst_bo_prep: err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_UNMAP); if (err) pr_perror("failed to GPU unmap the src BO %lx, size = %lx", gpu_addr_src, size); @@ -836,7 +898,6 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am err = amdgpu_bo_free(h_bo_src); if (err) pr_perror("src bo free failed"); - plugin_log_msg("Leaving sdma_copy_bo, err = %d\n", err); return err; } @@ -845,10 +906,9 @@ void *dump_bo_contents(void *_thread_data) { struct 
thread_data *thread_data = (struct thread_data *)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - BoEntry **bo_info = thread_data->bo_entries; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; - size_t max_bo_size = 0, image_size = 0; + size_t max_bo_size = 0, image_size = 0, buffer_size; uint64_t max_copy_size; uint32_t major, minor; int num_bos = 0; @@ -884,10 +944,11 @@ void *dump_bo_contents(void *_thread_data) } } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } @@ -910,15 +971,12 @@ void *dump_bo_contents(void *_thread_data) num_bos++; /* perform sDMA based vram copy */ - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_READ); + ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_READ); if (ret) { pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i); break; } - plugin_log_msg("** Successfully drained the BO using sDMA: bo_buckets[%d] **\n", i); - ret = write_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - break; } exit: @@ -939,8 +997,7 @@ void *restore_bo_contents(void *_thread_data) { struct thread_data *thread_data = (struct thread_data *)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - size_t image_size = 0, total_bo_size = 0, max_bo_size = 0; - BoEntry **bo_info = thread_data->bo_entries; + size_t image_size = 0, total_bo_size = 0, max_bo_size = 0, buffer_size; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; uint64_t max_copy_size; @@ -977,7 +1034,6 @@ void 
*restore_bo_contents(void *_thread_data) goto exit; } - /* Allocate buffer to fit biggest BO */ for (i = 0; i < thread_data->num_of_bos; i++) { if (bo_buckets[i].gpu_id == thread_data->gpu_id && (bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))) { @@ -995,10 +1051,11 @@ void *restore_bo_contents(void *_thread_data) goto exit; } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } @@ -1012,11 +1069,8 @@ void *restore_bo_contents(void *_thread_data) num_bos++; - ret = read_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - goto exit; - - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE); if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); break; From 4766ffa197cbb700a89dff4d19eae8807245b276 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Thu, 31 Aug 2023 14:37:49 -0400 Subject: [PATCH 141/321] compel: Add support for ppc64le scv syscalls Power ISA 3.0 added a new syscall instruction. Kernel 5.9 added corresponding support. Add CRIU support to recognize the new instruction and kernel ABI changes to properly dump and restore threads executing in syscalls. Without this change threads executing in syscalls using the scv instruction will not be restored to re-execute the syscall, they will be restored to execute the following instruction and will return unexpected error codes (ERESTARTSYS, etc) to user code. 
Signed-off-by: Younes Manton --- compel/arch/ppc64/src/lib/infect.c | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index db999ce37f..1603ac92e5 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -11,6 +11,7 @@ #include "log.h" #include "common/bug.h" #include "common/page.h" +#include "common/err.h" #include "infect.h" #include "infect-priv.h" @@ -303,33 +304,58 @@ static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs) return -1; /* still failing the checkpoint */ } -static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) -{ - pr_info("Dumping GP/FPU registers for %d\n", pid); +/* + * This is inspired by kernel function check_syscall_restart in + * arch/powerpc/kernel/signal.c + */ - /* - * This is inspired by kernel function check_syscall_restart in - * arch/powerpc/kernel/signal.c - */ #ifndef TRAP #define TRAP(r) ((r).trap & ~0xF) #endif - if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { - /* Restart the system call */ - switch (regs->gpr[3]) { - case ERESTARTNOHAND: - case ERESTARTSYS: - case ERESTARTNOINTR: - regs->gpr[3] = regs->orig_gpr3; - regs->nip -= 4; - break; - case ERESTART_RESTARTBLOCK: - pr_warn("Will restore %d with interrupted system call\n", pid); - regs->gpr[3] = EINTR; - break; - } +static bool trap_is_scv(user_regs_struct_t *regs) +{ + return TRAP(*regs) == 0x3000; +} + +static bool trap_is_syscall(user_regs_struct_t *regs) +{ + return trap_is_scv(regs) || TRAP(*regs) == 0x0C00; +} + +static void handle_syscall(pid_t pid, user_regs_struct_t *regs) +{ + unsigned long ret = regs->gpr[3]; + + if (trap_is_scv(regs)) { + if (!IS_ERR_VALUE(ret)) + return; + ret = -ret; + } else if (!(regs->ccr & 0x10000000)) { + return; + } + + /* Restart or interrupt the system call */ + switch (ret) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case 
ERESTARTNOINTR: + regs->gpr[3] = regs->orig_gpr3; + regs->nip -= 4; + break; + case ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->gpr[3] = trap_is_scv(regs) ? -EINTR : EINTR; + break; } +} + +static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (trap_is_syscall(regs)) + handle_syscall(pid, regs); /* Resetting trap since we are now coming from user space. */ regs->trap = 0; From 4ae1518df81132314a7ae3e1c5797d147f0dde24 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 25 Apr 2023 16:20:41 +0100 Subject: [PATCH 142/321] lib/pycriu: generate version.py The version of CRIU is specified in the Makefile.versions file. This patch generates the '__version__' value for the pycriu module. This value can be used by crit to implement `--version`. Signed-off-by: Radostin Stoyanov --- lib/py/.gitignore | 1 + lib/py/Makefile | 7 +++++-- lib/py/__init__.py | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/py/.gitignore b/lib/py/.gitignore index d3090fca32..fba7e38649 100644 --- a/lib/py/.gitignore +++ b/lib/py/.gitignore @@ -1,2 +1,3 @@ *_pb2.py *.pyc +version.py diff --git a/lib/py/Makefile b/lib/py/Makefile index 691b6bdd33..5ce9bc8f7e 100644 --- a/lib/py/Makefile +++ b/lib/py/Makefile @@ -1,4 +1,4 @@ -all-y += libpy-images rpc_pb2.py +all-y += libpy-images rpc_pb2.py version.py $(obj)/images/Makefile: ; $(obj)/images/%: .FORCE @@ -11,7 +11,10 @@ libpy-images: rpc_pb2.py: $(Q) protoc -I=images/ --python_out=$(obj) images/$(@:_pb2.py=.proto) -cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc) +version.py: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $(obj)/$@ + +cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc version.py) clean-lib-py: $(Q) $(MAKE) $(build)=$(obj)/images clean diff --git a/lib/py/__init__.py b/lib/py/__init__.py index 96b3e9526c..44f66ffa42 100644 --- a/lib/py/__init__.py +++ 
b/lib/py/__init__.py @@ -1,3 +1,4 @@ from . import rpc_pb2 as rpc from . import images from .criu import * +from .version import __version__ From b8b2fe6b7c48d6b5c346670d64aace645778cef7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 25 Apr 2023 16:38:44 +0100 Subject: [PATCH 143/321] crit/setup.py: use __version__ from pycriu Signed-off-by: Radostin Stoyanov --- crit/setup.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/crit/setup.py b/crit/setup.py index 1aaa73a130..2f584678fe 100644 --- a/crit/setup.py +++ b/crit/setup.py @@ -1,23 +1,9 @@ -import os from setuptools import setup, find_packages - - -def get_version(): - version = '0.0.1' - env = os.environ - if 'CRIU_VERSION_MAJOR' in env and 'CRIU_VERSION_MINOR' in env: - version = '{}.{}'.format( - env['CRIU_VERSION_MAJOR'], - env['CRIU_VERSION_MINOR'] - ) - if 'CRIU_VERSION_SUBLEVEL' in env and env['CRIU_VERSION_SUBLEVEL']: - version += '.' + env['CRIU_VERSION_SUBLEVEL'] - return version - +import pycriu setup( name='crit', - version=get_version(), + version=pycriu.__version__, description='CRiu Image Tool', author='CRIU team', author_email='criu@openvz.org', From 150eecc0f8f1b4ee8c0bdc1a902141d103732ee1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 25 Apr 2023 16:41:24 +0100 Subject: [PATCH 144/321] py/cli: add --version option This patch implements the '--version' for the crit tool. 
$ crit --version 3.17 Signed-off-by: Radostin Stoyanov --- lib/py/cli.py | 1 + test/others/crit/test.sh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/lib/py/cli.py b/lib/py/cli.py index 594035d27c..a3a0870f85 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -364,6 +364,7 @@ def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--version', action='version', version=pycriu.__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 105aac72b4..2698bbd3c2 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -101,6 +101,8 @@ function run_test2 { ${CRIT} x ./ rss || exit 1 } +${CRIT} --version + gen_imgs run_test1 run_test2 From 5e37ccf379ad7f47a720f64bced40cf2fe5bd6a2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 23 Sep 2023 08:50:05 -0700 Subject: [PATCH 145/321] ci: stop testing ubuntu overlayfs They break it with each kernel rebase. More details are here: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 Last time, it was fixed a few months ago and it has been broken again in 5.15.0-1046-azure. Let's bind-mount the CRIU directory into a test container to make it independent of a container file system. 
Signed-off-by: Andrei Vagin --- scripts/ci/Makefile | 35 +++++------------------------------ scripts/ci/asan.sh | 3 +++ 2 files changed, 8 insertions(+), 30 deletions(-) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index ce844a17ce..1caa1e4235 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -20,14 +20,6 @@ export CONTAINER_RUNTIME alpine: ZDTM_OPTS=-x zdtm/static/binfmt_misc -x zdtm/static/sched_policy00 -define DOCKER_JSON -{ - "storage-driver": "devicemapper" -} -endef - -export DOCKER_JSON - ifeq ($(GITHUB_ACTIONS),true) # GitHub Actions does not give us a real TTY and errors out with # 'the input device is not a TTY' if using '-t' @@ -47,34 +39,20 @@ else endif ifeq ($(CONTAINER_RUNTIME),podman) - # Just as Docker needs to use devicemapper Podman needs vfs - # as graphdriver as overlayfs does not support all test cases - STORAGE_DRIVER := vfs # Podman limits the number of processes in a container using cgroups. # Disable it as it breaks the thread-bomb test CONTAINER_OPTS += --pids-limit=0 endif -export STORAGE_DRIVER - -restart-docker: - if [ "$$UNAME" = "x86_64" ] && [ "$$CONTAINER_RUNTIME" = "docker" ]; then \ - echo "$$DOCKER_JSON" > /etc/docker/daemon.json; \ - cat /etc/docker/daemon.json; \ - systemctl status docker; \ - systemctl restart docker; \ - systemctl status docker; \ - fi - export ZDTM_OPTS -$(TARGETS): restart-docker +$(TARGETS): $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run --env-file docker.env $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh + $(CONTAINER_RUNTIME) run --env-file docker.env -v `pwd`/../../:/criu $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh -fedora-asan: restart-docker +fedora-asan: $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTS) + $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) -v `pwd`/../../:/criu criu-$@ ./scripts/ci/asan.sh 
$(ZDTM_OPTS) docker-test: ./docker-test.sh @@ -82,10 +60,7 @@ docker-test: podman-test: ./podman-test.sh -# overlayfs behaves differently on Ubuntu and breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 -# Switch to devicemapper -java-test: restart-docker +java-test: ./java-test.sh setup-vagrant: diff --git a/scripts/ci/asan.sh b/scripts/ci/asan.sh index deeeca0b9d..8b72fa5f1a 100755 --- a/scripts/ci/asan.sh +++ b/scripts/ci/asan.sh @@ -4,6 +4,9 @@ set -x cat /proc/self/mountinfo +time make ASAN=1 -j 4 V=1 +time make -j4 -C test/zdtm V=1 + chmod 0777 test chmod 0777 test/zdtm/transition/ chmod 0777 test/zdtm/static From 1a6c015276631333d7dc703566bff55572bbcfe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 29 May 2023 17:22:16 +0200 Subject: [PATCH 146/321] zdtm: If ignoring kernel taint, also ignore taint changes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least in Google's VM environment, the kernel taints are unrelated to CRIU runs. Don't fail tests if taints change, if kernel taints are ignored. Signed-off-by: Michał Mirosław --- test/zdtm.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index c6e852dc1a..bc14e3f73e 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2003,12 +2003,20 @@ def __init__(self, opts, nr_tests): file=self.__file_report) print(u"# ", file=self.__file_report) print(u"1.." 
+ str(nr_tests), file=self.__file_report) - with open("/proc/sys/kernel/tainted") as taintfd: - self.__taint = taintfd.read() + self.__taint = self.__read_kernel_tainted() if int(self.__taint, 0) != 0: - print("The kernel is tainted: %r" % self.__taint) - if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != '1': - raise Exception("The kernel is tainted: %r" % self.__taint) + self.__report_kernel_taint("The kernel is tainted: %r" % self.__taint) + + @staticmethod + def __read_kernel_tainted(): + with open("/proc/sys/kernel/tainted") as taintfd: + return taintfd.read().strip() + + @staticmethod + def __report_kernel_taint(msg): + print(msg) + if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != "1": + raise Exception(msg) def __show_progress(self, msg): perc = int(self.__nr * 16 / self.__total) @@ -2034,11 +2042,12 @@ def run_test(self, name, desc, flavor): if len(self.__subs) >= self.__max: self.wait() - with open("/proc/sys/kernel/tainted") as taintfd: - taint = taintfd.read() + taint = self.__read_kernel_tainted() if self.__taint != taint: - raise Exception("The kernel is tainted: %r (%r)" % - (taint, self.__taint)) + prev_taint = self.__taint + self.__taint = taint + self.__report_kernel_taint( + "The kernel is tainted: %r (was %r)" % (taint, prev_taint)) ''' The option --link-remap allows criu to hardlink open files back to the From f74140c16a4aeabc5a73136d3ea801f5ceb71c1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 30 May 2023 20:00:10 +0200 Subject: [PATCH 147/321] zdtm: cgroup04: Improve error messages. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the errno values reported by cgroup04 always correct and showing relevant parameters. Constify constant strings, while at it. 
Signed-off-by: Michał Mirosław --- test/zdtm/static/cgroup04.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test/zdtm/static/cgroup04.c b/test/zdtm/static/cgroup04.c index 8c40ffd6bd..f586a0628d 100644 --- a/test/zdtm/static/cgroup04.c +++ b/test/zdtm/static/cgroup04.c @@ -17,25 +17,25 @@ const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); -static const char *cgname = "zdtmtst"; +static const char *const cgname = "zdtmtst"; int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) { char aux[1024], paux[1024], subdir[1024]; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", dirname); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", subdir); return -1; } if (mount("none", subdir, "cgroup", 0, controller)) { - pr_perror("Can't mount cgroups"); + pr_perror("Can't mount cgroup controller %s at %s", controller, subdir); goto err_rd; } @@ -52,7 +52,8 @@ int mount_and_add(const char *controller, const char *path, const char *prop, co goto err_rs; ssprintf(paux, "%s/%s/special_prop_check", subdir, path); - mkdir(paux, 0600); + if (mkdir(paux, 0600) < 0) + pr_perror("Can't make dir %s", paux); return 0; err_rs: @@ -74,11 +75,11 @@ bool checkval(char *path, char *val) } n = read(fd, buf, sizeof(buf) - 1); + if (n < 0) + pr_perror("read %s", path); close(fd); - if (n < 0) { - pr_perror("read"); + if (n < 0) return false; - } buf[n] = 0; if (strcmp(val, buf)) { @@ -95,7 +96,7 @@ int main(int argc, char **argv) char buf[1024], path[PATH_MAX]; struct stat sb; - char *dev_allow[] = { + const char *const dev_allow[] = { "c *:* m", "b *:* m", "c 1:3 rwm", "c 1:5 rwm", "c 1:7 rwm", "c 5:0 rwm", "c 5:2 rwm", "c 1:8 rwm", "c 1:9 rwm", "c 136:* rwm", "c 10:229 rwm", }; 
@@ -126,12 +127,14 @@ int main(int argc, char **argv) sprintf(path, "%s/devices/%s/devices.list", dirname, cgname); if (!checkval(path, buf)) { + errno = 0; fail(); goto out; } sprintf(path, "%s/memory/%s/memory.limit_in_bytes", dirname, cgname); if (!checkval(path, "268435456\n")) { + errno = 0; fail(); goto out; } @@ -143,6 +146,7 @@ int main(int argc, char **argv) } if (!S_ISDIR(sb.st_mode)) { + errno = 0; fail("special_prop_check not a directory?"); goto out; } From 66369f917e2f8504ae407ea0709cd9ab6a5792b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 10:20:09 +0200 Subject: [PATCH 148/321] zdtm: cgroup04: Improve skip check's robustness. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup04 test needs full control over mem and devices cgroup hierarchies. Make the test's .checkskip script better at detecting if the cgroups are available for use. Signed-off-by: Michał Mirosław --- test/zdtm/static/cgroup04.checkskip | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/cgroup04.checkskip b/test/zdtm/static/cgroup04.checkskip index 205f8fc530..1ccbada4d0 100755 --- a/test/zdtm/static/cgroup04.checkskip +++ b/test/zdtm/static/cgroup04.checkskip @@ -1,3 +1,20 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! -f /sys/fs/cgroup/cgroup.controllers + +for ctl in devices memory; do + # Check that the controller is available. + + grep -q "^${ctl}\\s" /proc/cgroups + + # Check that the controller is not co-mounted with any other. + + # /proc/self/cgroup may have: + # "1:devices:/sys" + if ! 
grep -q "^[0-9]*:${ctl}:" /proc/self/cgroup; then + # but not eg: + # "1:devices,job:/sys" + grep -qE "^[0-9]*:([^:]*,)?${ctl}(,[^:]*)?:" /proc/self/cgroup && exit 1 + fi +done From 87c42d5d141b88714952764b61fcb5c99f5e75a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 21 Apr 2023 15:57:32 +0200 Subject: [PATCH 149/321] zdtm: Treat ESRCH from kill() as success. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a failure to clean up after a failed test, where CRIU didn't start properly. ``` ===================== Run zdtm/transition/socket-tcp in h ====================== Start test ./socket-tcp --pidfile=socket-tcp.pid --outfile=socket-tcp.out Traceback (most recent call last): File ".../zdtm_py.py", line 1906, in do_run_test cr(cr_api, t, opts) File ".../zdtm_py.py", line 1584, in cr cr_api.dump("dump") File ".../zdtm_py.py", line 1386, in dump self.__dump_process = self.__criu_act(action, File ".../zdtm_py.py", line 1224, in __criu_act raise test_fail_exc("CRIU %s" % action) test_fail_exc: CRIU dump During handling of the above exception, another exception occurred: Traceback (most recent call last): File "", line 182, in run_filename_from_loader_as_main File "", line 34, in _run_code_in_main File ".../zdtm_py.py", line 2790, in fork_zdtm() File ".../zdtm_py.py", line 2782, in fork_zdtm do_run_test(tinfo[0], tinfo[1], tinfo[2], tinfo[3]) File ".../zdtm_py.py", line 1922, in do_run_test t.kill() File ".../zdtm_py.py", line 509, in kill os.kill(int(self.__pid), sig) ProcessLookupError: [Errno 3] No such process ``` Signed-off-by: Michał Mirosław --- test/zdtm.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index bc14e3f73e..8108735752 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -507,8 +507,15 @@ def kill(self, sig=signal.SIGKILL): self.__freezer.thaw() if self.__pid: print("Send the %d signal to %s" % (sig, 
self.__pid)) - os.kill(int(self.__pid), sig) - self.gone(sig == signal.SIGKILL) + try: + os.kill(int(self.__pid), sig) + except ProcessLookupError: + if sig != signal.SIGKILL: + raise + print("The process %s doesn't exist" % self.__pid) + self.gone(True) + else: + self.gone(sig == signal.SIGKILL) self.__flavor.fini() From a056519daec43deddd6bb8cc5573471f4aa98a42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 15:31:10 +0200 Subject: [PATCH 150/321] zdtm: socket_udp_shutdown: Make the test fail instead of timing out. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When -- after restore -- sockets can't communicate, the test times out while waiting on recvfrom(). Since the communication is local, send() works instantaneously - so mark sockets with SOCK_NONBLOCK and report failure if the message is not received immediately. Signed-off-by: Michał Mirosław --- test/zdtm/static/socket_udp_shutdown.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/zdtm/static/socket_udp_shutdown.c b/test/zdtm/static/socket_udp_shutdown.c index 91dc8f30a4..a7658b9dd7 100644 --- a/test/zdtm/static/socket_udp_shutdown.c +++ b/test/zdtm/static/socket_udp_shutdown.c @@ -28,8 +28,8 @@ int main(int argc, char **argv) test_init(argc, argv); - sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); - sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + sk1 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); + sk2 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); if (sk1 < 0 || sk2 < 0) { pr_perror("Can't create socket"); exit(1); From 8b5f3af0a3f911f690685986c53a9e5cc69cdeed Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 26 Sep 2023 17:00:36 -0700 Subject: [PATCH 151/321] zdtm: check userns once All test logs are flooded with the "userns is supported" messages... 
Signed-off-by: Andrei Vagin --- test/zdtm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index 8108735752..7a7cdfd3b6 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2404,6 +2404,7 @@ def run_tests(opts): "Specify --criu-image-streamer-dir or modify PATH to provide an alternate location") .format(streamer_dir)) + usernsIsSupported = criu.check("userns") launcher = Launcher(opts, len(torun)) try: for t in torun: @@ -2473,7 +2474,7 @@ def run_tests(opts): run_flavs = set(test_flavs) & set(opts_flavs) else: run_flavs = set([test_flavs.pop()]) - if not criu.check("userns"): + if not usernsIsSupported: run_flavs -= set(['uns']) if opts['user']: # FIXME -- probably uns will make sense From 1e4f5fb2b6ba6259cf4c51ff24b0522e0101f44a Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Tue, 20 Jun 2023 13:23:24 +0400 Subject: [PATCH 152/321] Return page size as unsigned long Currently page_size() returns an unsigned int value that, after a "bitwise not", is promoted to an unsigned long value, e.g. in uffd.c handle_page_fault. Since the value is unsigned, the promotion zero-fills the upper bits, which results in the loss of the most significant bits of the page-fault address. So make page_size() return unsigned long to avoid such a situation. 
Signed-off-by: Vladislav Khmelevsky --- compel/plugins/std/infect.c | 2 +- criu/pie/restorer.c | 2 +- include/common/arch/aarch64/asm/page.h | 4 ++-- include/common/arch/loongarch64/asm/page.h | 4 ++-- include/common/arch/mips/asm/page.h | 4 ++-- include/common/arch/ppc64/asm/page.h | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compel/plugins/std/infect.c b/compel/plugins/std/infect.c index abecc140f1..60b21d3132 100644 --- a/compel/plugins/std/infect.c +++ b/compel/plugins/std/infect.c @@ -27,7 +27,7 @@ static struct rt_sigframe *sigframe; */ static unsigned __page_size; -unsigned __attribute((weak)) page_size(void) +unsigned long __attribute((weak)) page_size(void) { return __page_size; } diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0de2423a15..ba6f290dc8 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -104,7 +104,7 @@ bool fault_injected(enum faults f) * Hint: compel on aarch64 shall learn relocs for that. */ static unsigned __page_size; -unsigned page_size(void) +unsigned long page_size(void) { return __page_size; } diff --git a/include/common/arch/aarch64/asm/page.h b/include/common/arch/aarch64/asm/page.h index 90670d1265..4555debbdc 100644 --- a/include/common/arch/aarch64/asm/page.h +++ b/include/common/arch/aarch64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/loongarch64/asm/page.h b/include/common/arch/loongarch64/asm/page.h index 25bdbc1412..4fcdb64dc1 100644 --- a/include/common/arch/loongarch64/asm/page.h +++ b/include/common/arch/loongarch64/asm/page.h @@ -10,7 +10,7 @@ static 
unsigned __page_size; static unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -31,7 +31,7 @@ static inline unsigned page_shift(void) #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/mips/asm/page.h b/include/common/arch/mips/asm/page.h index 25bdbc1412..4fcdb64dc1 100644 --- a/include/common/arch/mips/asm/page.h +++ b/include/common/arch/mips/asm/page.h @@ -10,7 +10,7 @@ static unsigned __page_size; static unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -31,7 +31,7 @@ static inline unsigned page_shift(void) #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/ppc64/asm/page.h b/include/common/arch/ppc64/asm/page.h index a1ff6718ad..2b0c0e5042 100644 --- a/include/common/arch/ppc64/asm/page.h +++ b/include/common/arch/ppc64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ From 4f0c07fd543477e6dc4d2bea4e70d9f121870db2 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Tue, 20 Jun 2023 13:34:27 +0400 Subject: [PATCH 153/321] vma: Add !VVAR condition to vma_entry_can_be_lazy Currently most of the 
times we don't have problems with VVAR segment and lazy restore because when VDSO is parked there is an munmap call that calls UFFDIO_UNREGISTER on the destination address. But we don't want to enable userfaultfd for VDSO and VVAR in the first place. --- criu/include/vma.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/include/vma.h b/criu/include/vma.h index 106c56af26..4b663ee500 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -122,8 +122,8 @@ static inline struct vma_area *vma_next(struct vma_area *vma) static inline bool vma_entry_can_be_lazy(VmaEntry *e) { return ((e->flags & MAP_ANONYMOUS) && (e->flags & MAP_PRIVATE) && !(e->flags & MAP_LOCKED) && - !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && - !(e->flags & MAP_HUGETLB)); + !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VVAR)) && + !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && !(e->flags & MAP_HUGETLB)); } #endif /* __CR_VMA_H__ */ From 5de9040ee758f1fd1a2599b6f800013544c966b6 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Sat, 30 Sep 2023 00:26:09 +0200 Subject: [PATCH 154/321] criu: change the comment about magic numbers Signed-off-by: Michal Clapinski --- criu/include/magic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/magic.h b/criu/include/magic.h index 22d7218e45..0e8c37234e 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -29,7 +29,7 @@ /* * The magic-s below correspond to coordinates - * of various Russian towns in the NNNNEEEE form. + * of various towns in the NNNNEEEE form. */ #define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ From f832d875e398b922c0909f062d6e946923215063 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 21:38:45 -0700 Subject: [PATCH 155/321] plugins: the UPDATE_VMA_MAP callback returns fd with the full control It means CRIU has to close it when it is not needed. 
It looks more logically correct and matches the behaviour of the RESTORE_EXT_FILE callback. Signed-off-by: Andrei Vagin --- criu/files-reg.c | 2 +- plugins/amdgpu/amdgpu_plugin.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index cf0c84b52e..c80da1d8ce 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2506,7 +2506,7 @@ static int open_filemap(int pid, struct vma_area *vma) * using dup because dup returns a reference to the same struct file inside kernel, but we * cannot open a new FD. */ - ret = dup(plugin_fd); + ret = plugin_fd; } else if (vma->e->status & VMA_AREA_MEMFD) { if (!inherited_fd(vma->vmfd, &ret)) ret = memfd_open(vma->vmfd, &flags); diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 6a79f8b19d..9dae8861cb 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1955,10 +1955,15 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) { *new_offset = vma_md->new_pgoff; - if (is_renderD) - *updated_fd = vma_md->fd; - else - *updated_fd = -1; + *updated_fd = -1; + if (is_renderD) { + int fd = dup(vma_md->fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + *updated_fd = fd; + } plugin_log_msg("old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, vma_md->new_pgoff, *updated_fd); From 25f685e2de4f45ff7114691cc3365d843351cc60 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 21:47:50 -0700 Subject: [PATCH 156/321] amdgpu: don't leak fd on an error path in open_img_file Signed-off-by: Andrei Vagin --- plugins/amdgpu/amdgpu_plugin.c | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 9dae8861cb..e22168d931 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -165,6 +165,7 
@@ FILE *open_img_file(char *path, bool write, size_t *size) fp = fdopen(fd, write ? "w" : "r"); if (!fp) { pr_perror("%s: Failed get pointer for %s", path, write ? "write" : "read"); + close(fd); return NULL; } From 01d559dbe946fe7909c27f4a649c359ee8cf55a1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 21:49:39 -0700 Subject: [PATCH 157/321] amdgpu: print an error if the dup syscall fails Signed-off-by: Andrei Vagin --- plugins/amdgpu/amdgpu_plugin.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index e22168d931..2ebc5e1786 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1796,7 +1796,12 @@ int amdgpu_plugin_restore_file(int id) * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in * tp_node. */ - return dup(fd); + fd = dup(fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + return fd; } fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); From f5932572bbb219a63204ee00e8ccb6afb8d8e7a0 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 30 Sep 2023 02:56:52 +0100 Subject: [PATCH 158/321] ci: enable build with amdgpu plugin This patch adds the `libdrm-dev` package to the list of CRIU dependencies installed in CI to build CRIU with amdgpu plugin. 
Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 4 ++-- plugins/amdgpu/kfd_ioctl.h | 2 +- scripts/build/Dockerfile.alpine | 1 + scripts/build/Dockerfile.archlinux | 1 + scripts/ci/prepare-for-fedora-rawhide.sh | 1 + scripts/ci/run-ci-tests.sh | 7 ++++++- 6 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index e559ec772a..6a586d58b5 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto libdrm-devel systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. 
# The Cirrus CI user runs as a service from selinux point of view and is @@ -108,7 +108,7 @@ task: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : yum install -y dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index b88fe20cfe..e1ebb75a3a 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -23,7 +23,7 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include +#include #include /* diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index af1858ab58..cb746757a4 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -23,6 +23,7 @@ RUN apk update && apk add \ python3 \ sudo \ libcap-utils \ + libdrm-dev \ util-linux COPY . /criu diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index f2bce1e5ba..b9968e876b 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -35,6 +35,7 @@ RUN pacman -Syu --noconfirm \ asciidoctor \ python-junit-xml \ python-importlib-metadata \ + libdrm \ diffutils COPY . 
/criu diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 1c8a46fbfd..e31814a955 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -35,6 +35,7 @@ dnf install -y \ which \ e2fsprogs \ rubygem-asciidoctor \ + libdrm-devel \ kmod # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 47749e7fa8..1aae555f76 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -6,7 +6,7 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time flake8 libbsd-dev python3-yaml libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata python3-junit.xml) + python3-importlib-metadata python3-junit.xml libdrm-dev) X86_64_PKGS=(gcc-multilib) @@ -326,3 +326,8 @@ make -C test/others/action-script run # compel testing make -C compel/test + +# amdgpu_plugin testing +make amdgpu_plugin +make -C plugins/amdgpu/ test_topology_remap +./plugins/amdgpu/test_topology_remap From 2ff90f01b9c82b3b34758dee882f18f44f8e0b2f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 1 Oct 2023 08:58:59 +0100 Subject: [PATCH 159/321] amdgpu: fix clang warnings amdgpu_plugin.c:930:6: error: variable 'buffer' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized] if (ret) { ^~~ amdgpu_plugin.c:988:8: note: uninitialized use occurs here xfree(buffer); Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 2ebc5e1786..32ff8f9364 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -915,7 +915,7 @@ void *dump_bo_contents(void *_thread_data) int num_bos = 0; int 
i, ret = 0; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; pr_info("Thread[0x%x] started\n", thread_data->gpu_id); @@ -1004,7 +1004,7 @@ void *restore_bo_contents(void *_thread_data) uint64_t max_copy_size; uint32_t major, minor; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; int num_bos = 0; int i, ret = 0; From f54cf1912a204634c01ed124580952e3f3c11a61 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 22 Sep 2023 22:40:27 +0000 Subject: [PATCH 160/321] memfd: don't reopen file descriptors for memory mappings One memfd can be shared by a few restored files. Only one of these files is restored with a file created with memfd_open. Others are restored by reopening memfd files via /proc/self/fd/. It seems unnecessary for restoring memfd memory mappings. We can always use the original file. Signed-off-by: Andrei Vagin --- criu/files-reg.c | 2 +- criu/include/memfd.h | 4 +++- criu/memfd.c | 9 ++++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index c80da1d8ce..9fbab0d427 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2509,7 +2509,7 @@ static int open_filemap(int pid, struct vma_area *vma) ret = plugin_fd; } else if (vma->e->status & VMA_AREA_MEMFD) { if (!inherited_fd(vma->vmfd, &ret)) - ret = memfd_open(vma->vmfd, &flags); + ret = memfd_open(vma->vmfd, &flags, true); } else { ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); } diff --git a/criu/include/memfd.h b/criu/include/memfd.h index 1b1dc79bbc..78d8100198 100644 --- a/criu/include/memfd.h +++ b/criu/include/memfd.h @@ -1,7 +1,9 @@ #ifndef __CR_MEMFD_H__ #define __CR_MEMFD_H__ +#include #include + #include "int.h" #include "common/config.h" @@ -12,7 +14,7 @@ extern int is_memfd(dev_t dev); extern int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms); extern const struct fdtype_ops memfd_dump_ops; -extern int memfd_open(struct file_desc *d, u32 
*fdflags); +extern int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap); extern struct collect_image_info memfd_cinfo; extern struct file_desc *collect_memfd(u32 id); extern int apply_memfd_seals(void); diff --git a/criu/memfd.c b/criu/memfd.c index a770c66a11..9d9f0621fc 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -323,7 +323,7 @@ static int memfd_open_inode(struct memfd_restore_inode *inode) return fd; } -int memfd_open(struct file_desc *d, u32 *fdflags) +int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap) { struct memfd_info *mfi; MemfdFileEntry *mfe; @@ -342,6 +342,9 @@ int memfd_open(struct file_desc *d, u32 *fdflags) /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; + if (filemap && (flags & O_ACCMODE) == O_RDWR) + return fd; + if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { /* * If there is only a single RW-opened fd for a memfd, it can @@ -367,7 +370,7 @@ int memfd_open(struct file_desc *d, u32 *fdflags) _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); - else if ((flags & O_ACCMODE) == O_RDWR) + else if (!filemap && (flags & O_ACCMODE) == O_RDWR) pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); close(fd); @@ -382,7 +385,7 @@ static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) if (inherited_fd(d, new_fd)) return 0; - fd = memfd_open(d, NULL); + fd = memfd_open(d, NULL, false); if (fd < 0) return -1; From c20fb834747f6fc76428cc1d8bb8cba8013aaa8d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 18:07:13 +0000 Subject: [PATCH 161/321] zdtm/memfd04: check execveat on memfd that has memory mappings Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 2 ++ test/zdtm/static/memfd04.c | 33 ++++++++++++++++++++++++++++++--- test/zdtm/static/memfd05.c | 1 + test/zdtm/static/memfd05.desc | 1 + 4 files changed, 34 insertions(+), 3 
deletions(-) create mode 120000 test/zdtm/static/memfd05.c create mode 120000 test/zdtm/static/memfd05.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index b7fb79643c..4c7ca72fdf 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -259,6 +259,7 @@ TST_NOFILE := \ memfd02-hugetlb \ memfd03 \ memfd04 \ + memfd05 \ shmemfd \ shmemfd-priv \ time \ @@ -656,6 +657,7 @@ socket-tcp6-unconn: CFLAGS += -D ZDTM_IPV6 socket-tcp4v6-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK -D ZDTM_IPV4V6 socket-tcp4v6-closing: CFLAGS += -D ZDTM_IPV4V6 memfd02-hugetlb: CFLAGS += -D ZDTM_HUGETLB +memfd05: CFLAGS += -D ZDTM_MEMFD05 sockets00-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET sockets01-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET diff --git a/test/zdtm/static/memfd04.c b/test/zdtm/static/memfd04.c index aae7864c10..215e949d15 100644 --- a/test/zdtm/static/memfd04.c +++ b/test/zdtm/static/memfd04.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -73,20 +74,46 @@ static const size_t script_len = sizeof(script) - 1; int main(int argc, char *argv[]) { +#ifdef MEMFD05 + char path[PATH_MAX]; + char *addr_p, *addr_s; + int rofd; +#endif int fd; test_init(argc, argv); fd = _memfd_create("somename", 0); if (fd < 0) { - fail("memfd_create()"); + pr_perror("memfd_create()"); + return 1; + } + if (ftruncate(fd, script_len) == -1) { + pr_perror("ftruncate"); return 1; } - if (write(fd, script, script_len) != script_len) { - fail("write(memfd)"); + pr_perror("write(memfd)"); + return 1; + } +#ifdef MEMFD05 + snprintf(path, PATH_MAX - 1, "/proc/self/fd/%d", fd); + rofd = open(path, O_RDONLY); + if (rofd < 0) { + pr_perror("unable to open read-only memfd"); + return 1; + } + addr_p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, rofd, 0); + if (addr_p == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } + addr_s = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr_s == MAP_FAILED) { 
+ pr_perror("mmap"); return 1; } +#endif if (!test_exec_fd(fd)) return 1; diff --git a/test/zdtm/static/memfd05.c b/test/zdtm/static/memfd05.c new file mode 120000 index 0000000000..6caa9556fb --- /dev/null +++ b/test/zdtm/static/memfd05.c @@ -0,0 +1 @@ +memfd04.c \ No newline at end of file diff --git a/test/zdtm/static/memfd05.desc b/test/zdtm/static/memfd05.desc new file mode 120000 index 0000000000..1b4963572b --- /dev/null +++ b/test/zdtm/static/memfd05.desc @@ -0,0 +1 @@ +memfd04.desc \ No newline at end of file From 5bf7652504530b9baa04f411052f9c079c7f14eb Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 30 Aug 2023 11:57:40 +0800 Subject: [PATCH 162/321] clang-format: disable column limit constraint The "ColumnLimit: 120" is not only allowing lines to be longer than 80 characters but it also forces line wrapping at 120 characters. If total expression length is more than 120 characters, clang-format will try to wrap it as close to 120 as it can, it would not even allow to wrap at 80 characters if we really want it. But as we all know 80 characters is Linux kernel coding style default and as far as our coding style is based on it it is really strange to prohibit wrapping lines at 80 characters... 
Signed-off-by: Pavel Tikhomirov --- .clang-format | 2 +- scripts/fetch-clang-format.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.clang-format b/.clang-format index 4756380158..fb40bc613b 100644 --- a/.clang-format +++ b/.clang-format @@ -53,7 +53,7 @@ BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false -ColumnLimit: 120 +ColumnLimit: 0 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false # Unknown to clang-format-4.0 ConstructorInitializerAllOnOneLineOrOnePerLine: false diff --git a/scripts/fetch-clang-format.sh b/scripts/fetch-clang-format.sh index b80175f05b..5b6037d619 100755 --- a/scripts/fetch-clang-format.sh +++ b/scripts/fetch-clang-format.sh @@ -8,7 +8,7 @@ URL="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/.c curl -s "${URL}" | sed -e " s,^\( *\)#\([A-Z]\),\1\2,g; s,ControlStatements,ControlStatementsExceptForEachMacros,g; - s,ColumnLimit: 80,ColumnLimit: 120,g; + s,ColumnLimit: 80,ColumnLimit: 0,g; s,Intended for clang-format >= 4,Intended for clang-format >= 11,g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_bit',g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_pstree_item',g; From e3391ed60ebd9cf91149340ce4bfd074172cbfac Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 14 Feb 2022 11:47:30 +0000 Subject: [PATCH 163/321] pie: Mark __export_*() functions as externally_visible GCC's lto source: > To avoid this problem the compiler must assume that it sees the > whole program when doing link-time optimization. Strictly > speaking, the whole program is rarely visible even at link-time. > Standard system libraries are usually linked dynamically or not > provided with the link-time information. 
In GCC, the whole > program option (@option{-fwhole-program}) asserts that every > function and variable defined in the current compilation > unit is static, except for function @code{main} (note: at > link time, the current unit is the union of all objects compiled > with LTO). Since some functions and variables need to > be referenced externally, for example by another DSO or from an > assembler file, GCC also provides the function and variable > attribute @code{externally_visible} which can be used to disable > the effect of @option{-fwhole-program} on a specific symbol. As far as I read gcc's source, ipa_comdats() will avoid placing symbols that are either already in a user-defined section or have externally_visible attribute into new optimized gcc sections. Signed-off-by: Dmitry Safonov Signed-off-by: Andrei Vagin --- criu/pie/restorer.c | 6 +++--- include/common/compiler.h | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index ba6f290dc8..02971657ef 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -735,7 +735,7 @@ static int recv_cg_set_restore_ack(int sk) * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. */ -long __export_restore_thread(struct thread_restore_args *args) +__visible long __export_restore_thread(struct thread_restore_args *args) { struct rt_sigframe *rt_sigframe; k_rtsigset_t to_block; @@ -1276,7 +1276,7 @@ unsigned long vdso_rt_size = 0; void *bootstrap_start = NULL; unsigned int bootstrap_len = 0; -void __export_unmap(void) +__visible void __export_unmap(void) { sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size); } @@ -1608,7 +1608,7 @@ static int restore_membarrier_registrations(int mask) * and jump execution to some predefined ip read from * core file. 
*/ -long __export_restore_task(struct task_restore_args *args) +__visible long __export_restore_task(struct task_restore_args *args) { long ret = -1; int i; diff --git a/include/common/compiler.h b/include/common/compiler.h index 1c9d3db8d6..1347b62362 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -30,6 +30,17 @@ #define __always_unused __attribute__((unused)) #define __must_check __attribute__((__warn_unused_result__)) +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +/* Not supported by clang */ +#if __has_attribute(__externally_visible__) +#define __visible __attribute__((__externally_visible__)) +#else +#define __visible +#endif + #define __section(S) __attribute__((__section__(#S))) #ifndef __always_inline From 24e24921bf1fd44e62907baa38e84fa2094a8364 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 21:26:41 +0000 Subject: [PATCH 164/321] util: allow to run criu under strace fork_and_ptrace_attach has to fork a child with CLONE_UNTRACED, so that strace doesn't trace it. 
Signed-off-by: Andrei Vagin --- criu/util.c | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/criu/util.c b/criu/util.c index 993ab97bb8..95ba0feda6 100644 --- a/criu/util.c +++ b/criu/util.c @@ -661,40 +661,54 @@ int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], un return ret; } +struct child_args { + int *sk_pair; + int (*child_setup)(void); +}; + +static int child_func(void *_args) +{ + struct child_args *args = _args; + int sk, *sk_pair = args->sk_pair; + char c = 0; + + sk = sk_pair[1]; + close(sk_pair[0]); + + if (args->child_setup && args->child_setup() != 0) + exit(1); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + exit(1); + } + + while (1) + sleep(1000); + exit(1); +} + pid_t fork_and_ptrace_attach(int (*child_setup)(void)) { pid_t pid; int sk_pair[2], sk; char c = 0; + struct child_args cargs = { + .sk_pair = sk_pair, + .child_setup = child_setup, + }; if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } - pid = fork(); + pid = clone_noasan(child_func, CLONE_UNTRACED | SIGCHLD, &cargs); if (pid < 0) { pr_perror("fork"); return -1; } - if (pid == 0) { - sk = sk_pair[1]; - close(sk_pair[0]); - - if (child_setup && child_setup() != 0) - exit(1); - - if (write(sk, &c, 1) != 1) { - pr_perror("write"); - exit(1); - } - - while (1) - sleep(1000); - exit(1); - } - sk = sk_pair[0]; close(sk_pair[1]); From 4e5247a264bbec2b0c6c3936d695bf61cf0b279d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 5 Oct 2023 15:16:36 -0700 Subject: [PATCH 165/321] tun: don't parse buffers that have not been filled with data read_ns_sys_file() can return an error, but we are trying to parse a buffer before checking a return code. CID 417395 (#3 of 3): String not null terminated (STRING_NULL) 2. string_null: Passing unterminated string buf to strtol, which expects a null-terminated string. 
Signed-off-by: Andrei Vagin --- criu/net.c | 7 +++++-- criu/tun.c | 15 +++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/criu/net.c b/criu/net.c index 4abfc182a8..e422e2f69b 100644 --- a/criu/net.c +++ b/criu/net.c @@ -111,15 +111,18 @@ int read_ns_sys_file(char *path, char *buf, int len) } rlen = read(fd, buf, len); + if (rlen == -1) + pr_perror("Can't read ns' %s", path); close(fd); if (rlen == len) { + buf[0] = '\0'; pr_err("Too small buffer to read ns sys file %s\n", path); return -1; } - if (rlen > 0) - buf[rlen - 1] = '\0'; + if (rlen >= 0) + buf[rlen] = '\0'; return rlen; } diff --git a/criu/tun.c b/criu/tun.c index 2e2cc32bf5..9d66f99296 100644 --- a/criu/tun.c +++ b/criu/tun.c @@ -455,27 +455,26 @@ int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **in TunLinkEntry tle = TUN_LINK_ENTRY__INIT; char spath[64]; char buf[64]; - int ret = 0; struct tun_link *tl; sprintf(spath, "class/net/%s/tun_flags", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.flags = strtol(buf, NULL, 0); sprintf(spath, "class/net/%s/owner", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.owner = strtol(buf, NULL, 10); sprintf(spath, "class/net/%s/group", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.group = strtol(buf, NULL, 10); - if (ret < 0) - return ret; - tl = get_tun_link_fd(nde->name, nde->peer_nsid, tle.flags); if (!tl) - return ret; + return -1; tle.vnethdr = tl->dmp.vnethdr; tle.sndbuf = tl->dmp.sndbuf; From 3015aade8cbb1d06cf7a9db1a9eca18058c4f8d2 Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:07:13 -0500 Subject: [PATCH 166/321] apparmor: remove the redundant check This check is redundant as line 201 checks for this condition. 
Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- criu/apparmor.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/criu/apparmor.c b/criu/apparmor.c index 5b62759e23..e46e239f59 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -207,8 +207,6 @@ static int by_time(const struct dirent **de1, const struct dirent **de2) } else { if (sb1.st_mtim.tv_sec < sb2.st_mtim.tv_sec) return -1; - if (sb1.st_mtim.tv_sec == sb2.st_mtim.tv_sec) - return 0; return 1; } } From 9e05b658763f37862e654cbccc6ea1724081922c Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:08:37 -0500 Subject: [PATCH 167/321] arch/x86: remove the redundant check The is_native field is a boolean. Therefore, else if() can be changed to a simple else{}. Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- criu/arch/x86/sigframe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/arch/x86/sigframe.c b/criu/arch/x86/sigframe.c index 4fa7eb3dc9..46612e70d3 100644 --- a/criu/arch/x86/sigframe.c +++ b/criu/arch/x86/sigframe.c @@ -23,7 +23,7 @@ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *r } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; - } else if (!sigframe->is_native) { + } else { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_ia32.xsave; sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; if ((addr % 64ul)) { From 06a3f130463e50c373099401e776fdf9db279cf8 Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:18:16 -0500 Subject: [PATCH 168/321] zdtm/cow00: fix typo The condition was meant to check fd2 instead of fd1, which is checked in line 24. 
Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- test/zdtm/static/cow00.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/cow00.c b/test/zdtm/static/cow00.c index cb0c6733ea..456b6a7b4b 100644 --- a/test/zdtm/static/cow00.c +++ b/test/zdtm/static/cow00.c @@ -29,7 +29,7 @@ static int is_cow(void *addr, pid_t p1, pid_t p2) snprintf(buf, sizeof(buf), "/proc/%d/pagemap", p2); fd2 = open(buf, O_RDONLY); - if (fd1 < 0) { + if (fd2 < 0) { pr_perror("Unable to open file %s", buf); return -1; } From c03c737f6cfadc0bcbdaf30bed6eb31e8085ae52 Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:19:09 -0500 Subject: [PATCH 169/321] zdtm/thread_different_uid_gid: remove the redundant check line 131 checks if (ret >= 0). line 133 could be replaced by a simple else statement Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- test/zdtm/static/thread_different_uid_gid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/thread_different_uid_gid.c b/test/zdtm/static/thread_different_uid_gid.c index 3a0b6291b1..88f99659b3 100644 --- a/test/zdtm/static/thread_different_uid_gid.c +++ b/test/zdtm/static/thread_different_uid_gid.c @@ -130,7 +130,7 @@ int main(int argc, char **argv) ret = syscall(SYS_setresgid, maingroup, maingroup, maingroup); if (ret >= 0) { ret = syscall(SYS_setresuid, mainuser, mainuser, mainuser); - } else if (ret < 0) { + } else { pr_perror("Failed to drop privileges"); exit(1); } From ab73a8404c43e628364f40b46b4bd26b7d750996 Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:17:38 -0500 Subject: [PATCH 170/321] criu/proc_parse: refactor the eventpoll parser Eventpollentry's fields are set only when ret == 3 or ret == 6. 
The remaining cases can be grouped together to an error Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- criu/proc_parse.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 16392e3864..2b94050350 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1972,10 +1972,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) " pos:%lli ino:%lx sdev:%x", &e->tfd, &e->events, (long long *)&e->data, (long long *)&e->pos, (long *)&e->inode, &e->dev); - if (ret < 3 || ret > 6) { - eventpoll_tfd_entry__free_unpacked(e, NULL); - goto parse_err; - } else if (ret == 3) { + if (ret == 3) { e->has_dev = false; e->has_inode = false; e->has_pos = false; @@ -1983,7 +1980,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) e->has_dev = true; e->has_inode = true; e->has_pos = true; - } else if (ret < 6) { + } else { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; } From 811a380ee407e9106a60386d13486054eba7e967 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 11 Oct 2023 14:21:34 +0200 Subject: [PATCH 171/321] files-reg: don't change the file pos in get_build_id At this point the correct position is already restored, so reading from the fd results in the position being moved forward by 5 bytes. 
Fixes: 9191f8728d62 ("criu/files-reg.c: add build-id validation functionality") Signed-off-by: Michal Clapinski --- criu/files-reg.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index 9fbab0d427..fc61493501 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1650,22 +1650,10 @@ static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, co */ static int get_build_id(const int fd, const struct stat *fd_status, unsigned char **build_id) { - char buf[SELFMAG + 1]; - void *start_addr; + char *start_addr; size_t mapped_size; int ret = -1; - if (read(fd, buf, SELFMAG + 1) != SELFMAG + 1) - return -1; - - /* - * The first 4 bytes contain a magic number identifying the file as an - * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and - * ‘F’, respectively. These characters are together defined as ELFMAG. - */ - if (strncmp(buf, ELFMAG, SELFMAG)) - return -1; - /* * If the build-id exists, then it will most likely be present in the * beginning of the file. Therefore at most only the first 1 MB of the @@ -1673,16 +1661,25 @@ static int get_build_id(const int fd, const struct stat *fd_status, unsigned cha */ mapped_size = min_t(size_t, fd_status->st_size, BUILD_ID_MAP_SIZE); start_addr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); - if (start_addr == MAP_FAILED) { + if ((void*)start_addr == MAP_FAILED) { pr_warn("Couldn't mmap file with fd %d\n", fd); return -1; } - if (buf[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(start_addr, build_id, fd, mapped_size); - if (buf[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(start_addr, build_id, fd, mapped_size); + /* + * The first 4 bytes contain a magic number identifying the file as an + * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and + * ‘F’, respectively. These characters are together defined as ELFMAG. 
+ */ + if (memcmp(start_addr, ELFMAG, SELFMAG)) + goto out; + if (start_addr[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32((Elf32_Ehdr *)start_addr, build_id, fd, mapped_size); + if (start_addr[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64((Elf64_Ehdr *)start_addr, build_id, fd, mapped_size); + +out: munmap(start_addr, mapped_size); return ret; } From d9ca0c7b528534637ef2495ba1e3000eb821cba2 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 11 Oct 2023 14:26:39 +0200 Subject: [PATCH 172/321] zdtm/lib: add missing signal.h header Signed-off-by: Michal Clapinski --- test/zdtm/lib/lock.h | 1 + 1 file changed, 1 insertion(+) diff --git a/test/zdtm/lib/lock.h b/test/zdtm/lib/lock.h index 2b23550be5..cc5306e060 100644 --- a/test/zdtm/lib/lock.h +++ b/test/zdtm/lib/lock.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "asm/atomic.h" #define BUG_ON(condition) \ From 42c1c84b95f9ddceb0228416005aab4467650cc0 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 11 Oct 2023 14:27:29 +0200 Subject: [PATCH 173/321] zdtm/static: test the offset migration of ELF files Signed-off-by: Michal Clapinski --- test/zdtm/static/Makefile | 1 + test/zdtm/static/fd_offset.c | 42 ++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 test/zdtm/static/fd_offset.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 4c7ca72fdf..07d3bc6e21 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -269,6 +269,7 @@ TST_NOFILE := \ sigtrap \ sigtrap01 \ change_mnt_context \ + fd_offset \ # jobctl00 \ PKG_CONFIG ?= pkg-config diff --git a/test/zdtm/static/fd_offset.c b/test/zdtm/static/fd_offset.c new file mode 100644 index 0000000000..96255a4a1f --- /dev/null +++ b/test/zdtm/static/fd_offset.c @@ -0,0 +1,42 @@ +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check that criu properly restores offsets on ELF files"; +const char *test_author = "Michal Clapinski "; + 
+void check_offset(int fd) +{ + int offset = lseek(fd, 0, SEEK_CUR); + if (offset < 0) { + fail("lseek"); + exit(1); + } + if (offset != 0) { + fail("wrong offset; expected: 0, got: %d", offset); + exit(1); + } +} + +int main(int argc, char **argv) +{ + int fd; + + test_init(argc, argv); + + fd = open("/proc/self/exe", O_RDONLY); + if (fd < 0) { + fail("open"); + exit(1); + } + check_offset(fd); + + test_daemon(); + test_waitsig(); + + check_offset(fd); + + pass(); + return 0; +} From 711775f401c8a4078fb38444a87d440f3fb1cb96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 13:31:34 +0200 Subject: [PATCH 174/321] zdtm: cgroup_ifpriomap: Improve skip check's robustness. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup_ifpriomap test needs net_prio cgroup, which might not be available. Make the .checkskip script check it. Signed-off-by: Michał Mirosław --- test/zdtm/static/cgroup_ifpriomap.checkskip | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/cgroup_ifpriomap.checkskip b/test/zdtm/static/cgroup_ifpriomap.checkskip index 205f8fc530..f401ad1b24 100755 --- a/test/zdtm/static/cgroup_ifpriomap.checkskip +++ b/test/zdtm/static/cgroup_ifpriomap.checkskip @@ -1,3 +1,6 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! -f /sys/fs/cgroup/cgroup.controllers + +grep -q '^net_prio\s' /proc/cgroups From df24fe819dc23caa3e96df9083649560178d6a08 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 9 Oct 2023 15:55:25 +0100 Subject: [PATCH 175/321] lib: use separate packages for pycriu and crit Newer versions of pip use an isolated virtual environment when building Python projects. However, when the source code of CRIT is copied into the isolated environment, the symlink for `../lib/py` (pycriu) becomes invalid. As a workaround, we used the `--no-build-isolation` option for `pip install`. 
However, this functionality has issues in some versions of PIP [1, 2]. To fix this problem, this patch adds separate packages for pycriu and crit, and each package is installed independently. [1] https://github.com/pypa/pip/pull/8221 [2] https://github.com/pypa/pip/issues/8165#issuecomment-625401463 Signed-off-by: Radostin Stoyanov --- Makefile | 16 ++++++--- Makefile.install | 7 +++- coredump/pycriu | 2 +- crit/.gitignore | 2 ++ crit/Makefile | 40 ++++++++++++++++++++++ crit/crit/__init__.py | 1 + lib/py/cli.py => crit/crit/__main__.py | 20 +++++------ crit/pycriu | 1 - crit/pyproject.toml | 23 +++++++++++-- crit/requirements.txt | 7 ---- crit/setup.cfg | 20 +++++++++++ crit/setup.py | 19 +++------- lib/.gitignore | 1 + lib/Makefile | 32 ++++++++--------- lib/{py => pycriu}/.gitignore | 1 + lib/{py => pycriu}/Makefile | 0 lib/{py => pycriu}/__init__.py | 2 +- lib/{py => pycriu}/criu.py | 0 lib/{py => pycriu}/images/.gitignore | 0 lib/{py => pycriu}/images/Makefile | 0 lib/{py => pycriu}/images/__init__.py | 0 lib/{py => pycriu}/images/images.py | 0 lib/{py => pycriu}/images/pb2dict.py | 0 lib/pyproject.toml | 19 ++++++++++ lib/setup.cfg | 16 +++++++++ crit/crit => lib/setup.py | 4 +-- test/others/env.sh | 11 ++++-- test/pycriu | 2 +- test/zdtm/static/socket-tcp-fin-wait1.hook | 2 +- 29 files changed, 182 insertions(+), 66 deletions(-) create mode 100644 crit/Makefile create mode 100644 crit/crit/__init__.py rename lib/py/cli.py => crit/crit/__main__.py (95%) delete mode 120000 crit/pycriu delete mode 100644 crit/requirements.txt create mode 100644 crit/setup.cfg create mode 100644 lib/.gitignore rename lib/{py => pycriu}/.gitignore (68%) rename lib/{py => pycriu}/Makefile (100%) rename lib/{py => pycriu}/__init__.py (67%) rename lib/{py => pycriu}/criu.py (100%) rename lib/{py => pycriu}/images/.gitignore (100%) rename lib/{py => pycriu}/images/Makefile (100%) rename lib/{py => pycriu}/images/__init__.py (100%) rename lib/{py => pycriu}/images/images.py (100%) 
rename lib/{py => pycriu}/images/pb2dict.py (100%) create mode 100644 lib/pyproject.toml create mode 100644 lib/setup.cfg rename crit/crit => lib/setup.py (55%) mode change 100755 => 100644 diff --git a/Makefile b/Makefile index 8f2c294d5a..432dce6775 100644 --- a/Makefile +++ b/Makefile @@ -164,7 +164,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: flog criu lib +all: flog criu lib crit .PHONY: all # @@ -298,9 +298,9 @@ clean mrproper: $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ - $(Q) $(MAKE) $(build)=lib $@ .PHONY: clean mrproper clean-amdgpu_plugin: @@ -347,6 +347,10 @@ amdgpu_plugin: criu $(Q) $(MAKE) -C plugins/amdgpu all .PHONY: amdgpu_plugin +crit: lib + $(Q) $(MAKE) -C crit +.PHONY: crit + # # Generating tar requires tag matched CRIU_VERSION. # If not found then simply use GIT's describe with @@ -412,6 +416,7 @@ help: @echo ' Targets:' @echo ' all - Build all [*] targets' @echo ' * criu - Build criu' + @echo ' * crit - Build crit' @echo ' zdtm - Build zdtm test-suite' @echo ' docs - Build documentation' @echo ' install - Install CRIU (see INSTALL.md)' @@ -435,11 +440,12 @@ lint: flake8 --config=scripts/flake8.cfg test/zdtm.py flake8 --config=scripts/flake8.cfg test/inhfd/*.py flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py - flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py - flake8 --config=scripts/flake8.cfg lib/py/images/images.py + flake8 --config=scripts/flake8.cfg lib/pycriu/images/pb2dict.py + flake8 --config=scripts/flake8.cfg lib/pycriu/images/images.py flake8 --config=scripts/flake8.cfg scripts/criu-ns flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py - flake8 --config=scripts/flake8.cfg crit/setup.py + flake8 --config=scripts/flake8.cfg crit/*.py + flake8 --config=scripts/flake8.cfg 
crit/crit/*.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump flake8 --config=scripts/flake8.cfg scripts/github-indent-warnings.py diff --git a/Makefile.install b/Makefile.install index c798637beb..6f5b31924d 100644 --- a/Makefile.install +++ b/Makefile.install @@ -37,6 +37,10 @@ install-lib: lib $(Q) $(MAKE) $(build)=lib install .PHONY: install-lib +install-crit: lib + $(Q) $(MAKE) $(build)=crit install +.PHONY: install-crit + install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu @@ -50,12 +54,13 @@ install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-criu install-compel install-amdgpu_plugin ; +install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ; .PHONY: install uninstall: $(Q) $(MAKE) -C Documentation $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/coredump/pycriu b/coredump/pycriu index d13a8790a9..d1b6ed5c45 120000 --- a/coredump/pycriu +++ b/coredump/pycriu @@ -1 +1 @@ -../lib/py/ \ No newline at end of file +../lib/pycriu \ No newline at end of file diff --git a/crit/.gitignore b/crit/.gitignore index 810661179d..10c8ab1869 100644 --- a/crit/.gitignore +++ b/crit/.gitignore @@ -1,2 +1,4 @@ crit.egg-info/ build/ +dist/ +version.py diff --git a/crit/Makefile b/crit/Makefile new file mode 100644 index 0000000000..9a856db6d2 --- /dev/null +++ b/crit/Makefile @@ -0,0 +1,40 @@ +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES := 0 + +VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) + +all-y += ${VERSION_FILE} +cleanup-y 
+= ${VERSION_FILE} + +${VERSION_FILE}: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ + +install: ${VERSION_FILE} +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" +else + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit +endif +else + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit +endif +.PHONY: install + +uninstall: +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" +else + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +endif +else + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +endif +.PHONY: uninstall diff --git a/crit/crit/__init__.py b/crit/crit/__init__.py new file mode 100644 index 0000000000..58f3ace6c0 --- /dev/null +++ b/crit/crit/__init__.py @@ -0,0 +1 @@ +from .version import __version__ diff --git a/lib/py/cli.py b/crit/crit/__main__.py similarity index 95% rename from lib/py/cli.py rename to crit/crit/__main__.py index a3a0870f85..e15327f503 100755 --- a/lib/py/cli.py +++ b/crit/crit/__main__.py @@ -5,6 +5,7 @@ import os import pycriu +from . import __version__ def inf(opts): @@ -41,9 +42,9 @@ def decode(opts): try: img = pycriu.images.load(inf(opts), opts['pretty'], opts['nopl']) except pycriu.images.MagicException as exc: - print("Unknown magic %#x.\n"\ - "Maybe you are feeding me an image with "\ - "raw data(i.e. pages.img)?" 
% exc.magic, file=sys.stderr) + print("Unknown magic %#x.\n" + "Maybe you are feeding me an image with " + "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) sys.exit(1) if opts['pretty']: @@ -59,9 +60,9 @@ def encode(opts): try: img = json.load(inf(opts)) except UnicodeDecodeError: - print("Cannot read JSON.\n"\ - "Maybe you are feeding me an image with protobuf data? "\ - "Encode expects JSON input.", file=sys.stderr) + print("Cannot read JSON.\n" + "Maybe you are feeding me an image with protobuf data? " + "Encode expects JSON input.", file=sys.stderr) sys.exit(1) pycriu.images.dump(img, outf(opts, False)) @@ -131,7 +132,7 @@ def ftype_find_in_files(opts, ft, fid): if files_img is None: try: files_img = pycriu.images.load(dinf(opts, "files.img"))['entries'] - except: + except Exception: files_img = [] if len(files_img) == 0: @@ -364,7 +365,7 @@ def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--version', action='version', version=pycriu.__version__) + parser.add_argument('--version', action='version', version=__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') @@ -374,8 +375,7 @@ def main(): 'decode', help='convert criu image from binary type to json') decode_parser.add_argument( '--pretty', - help= - 'Multiline with indents and some numerical fields in field-specific format', + help='Multiline with indents and some numerical fields in field-specific format', action='store_true') decode_parser.add_argument( '-i', diff --git a/crit/pycriu b/crit/pycriu deleted file mode 120000 index d13a8790a9..0000000000 --- a/crit/pycriu +++ /dev/null @@ -1 +0,0 @@ -../lib/py/ \ No newline at end of file diff --git a/crit/pyproject.toml b/crit/pyproject.toml index 019b0d8488..9089f0a394 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -1,3 +1,22 @@ [build-system] -# Minimum requirements for the build 
system to execute. -requires = ["setuptools", "wheel"] # PEP 508 specifications. +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "crit" +description = "CRiu Image Tool" +authors = [ + {name = "CRIU team", email = "criu@openvz.org"}, +] +license = {text = "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[project.scripts] +crit = "crit.__main__:main" + +[tool.setuptools] +packages = ["crit"] + +[tool.setuptools.dynamic] +version = {attr = "crit.__version__"} diff --git a/crit/requirements.txt b/crit/requirements.txt deleted file mode 100644 index c27e6d4f0b..0000000000 --- a/crit/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -# We need pip version 20.1 or newer to correctly build with 'pycriu' symlink. -# - Building of local directories with pip 20.1 or newer is done in place, -# instead of a temporary location containing a copy of the directory tree. -# (https://github.com/pypa/pip/issues/7555) -pip>=20.1 -setuptools>=42.0.0 -wheel diff --git a/crit/setup.cfg b/crit/setup.cfg new file mode 100644 index 0000000000..fbc9a51439 --- /dev/null +++ b/crit/setup.cfg @@ -0,0 +1,20 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = crit +description = CRiu Image Tool +author = CRIU team +author_email = criu@openvz.org +license = GPLv2 +version = attr: crit.__version__ + +[options] +packages = crit +python_requires = >=3.6 + +[options.entry_points] +console_scripts = + crit = crit.__main__:main diff --git a/crit/setup.py b/crit/setup.py index 2f584678fe..618ac1de48 100644 --- a/crit/setup.py +++ b/crit/setup.py @@ -1,15 +1,6 @@ -from setuptools import setup, find_packages -import pycriu +#!/usr/bin/env python3 +import setuptools -setup( - 
name='crit', - version=pycriu.__version__, - description='CRiu Image Tool', - author='CRIU team', - author_email='criu@openvz.org', - license='GPLv2', - url='https://github.com/checkpoint-restore/criu', - packages=find_packages('.'), - scripts=['crit'], - install_requires=[], -) + +if __name__ == '__main__': + setuptools.setup() diff --git a/lib/.gitignore b/lib/.gitignore new file mode 100644 index 0000000000..a10181b800 --- /dev/null +++ b/lib/.gitignore @@ -0,0 +1 @@ +pycriu.egg-info/ diff --git a/lib/Makefile b/lib/Makefile index 32d238de4d..ae371e78e0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -28,17 +28,17 @@ lib-a: lib/c/$(CRIU_A) # # Python bindings. -lib/py/Makefile: ; -lib/py/%: .FORCE +lib/pycriu/Makefile: ; +lib/pycriu/%: .FORCE $(call msg-gen, $@) - $(Q) $(MAKE) $(build)=lib/py $@ + $(Q) $(MAKE) $(build)=lib/pycriu $@ lib-py: - $(Q) $(MAKE) $(build)=lib/py all + $(Q) $(MAKE) $(build)=lib/pycriu all .PHONY: lib-py clean-lib: $(Q) $(MAKE) $(build)=lib/c clean - $(Q) $(MAKE) $(build)=lib/py clean + $(Q) $(MAKE) $(build)=lib/pycriu clean .PHONY: clean-lib clean: clean-lib cleanup-y += lib/c/$(CRIU_SO) lib/c/$(CRIU_A) lib/c/criu.pc @@ -59,17 +59,15 @@ install: lib-c lib-a lib-py lib/c/criu.pc.in $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " SKIP INSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" else - $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install -r ./crit/requirements.txt - $(Q) $(PYTHON) -m pip install --no-build-isolation --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit + $(E) " INSTALL " pycriu + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib endif else - $(E) " INSTALL " 
crit - $(Q) $(PYTHON) -m pip install -r ./crit/requirements.txt - $(Q) $(PYTHON) -m pip install --no-build-isolation --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit + $(E) " INSTALL " pycriu + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib endif .PHONY: install @@ -84,14 +82,14 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " SKIP UNINSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" else - $(E) " UNINSTALL" crit - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit + $(E) " UNINSTALL" pycriu + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu endif else - $(E) " UNINSTALL" crit - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit + $(E) " UNINSTALL" pycriu + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu endif .PHONY: uninstall diff --git a/lib/py/.gitignore b/lib/pycriu/.gitignore similarity index 68% rename from lib/py/.gitignore rename to lib/pycriu/.gitignore index fba7e38649..111642787a 100644 --- a/lib/py/.gitignore +++ b/lib/pycriu/.gitignore @@ -1,3 +1,4 @@ +__pycache__ *_pb2.py *.pyc version.py diff --git a/lib/py/Makefile b/lib/pycriu/Makefile similarity index 100% rename from lib/py/Makefile rename to lib/pycriu/Makefile diff --git a/lib/py/__init__.py b/lib/pycriu/__init__.py similarity index 67% rename from lib/py/__init__.py rename to lib/pycriu/__init__.py index 44f66ffa42..2abcf029de 100644 --- a/lib/py/__init__.py +++ b/lib/pycriu/__init__.py @@ -1,4 +1,4 @@ from . import rpc_pb2 as rpc from . 
import images from .criu import * -from .version import __version__ +from .version import __version__ \ No newline at end of file diff --git a/lib/py/criu.py b/lib/pycriu/criu.py similarity index 100% rename from lib/py/criu.py rename to lib/pycriu/criu.py diff --git a/lib/py/images/.gitignore b/lib/pycriu/images/.gitignore similarity index 100% rename from lib/py/images/.gitignore rename to lib/pycriu/images/.gitignore diff --git a/lib/py/images/Makefile b/lib/pycriu/images/Makefile similarity index 100% rename from lib/py/images/Makefile rename to lib/pycriu/images/Makefile diff --git a/lib/py/images/__init__.py b/lib/pycriu/images/__init__.py similarity index 100% rename from lib/py/images/__init__.py rename to lib/pycriu/images/__init__.py diff --git a/lib/py/images/images.py b/lib/pycriu/images/images.py similarity index 100% rename from lib/py/images/images.py rename to lib/pycriu/images/images.py diff --git a/lib/py/images/pb2dict.py b/lib/pycriu/images/pb2dict.py similarity index 100% rename from lib/py/images/pb2dict.py rename to lib/pycriu/images/pb2dict.py diff --git a/lib/pyproject.toml b/lib/pyproject.toml new file mode 100644 index 0000000000..8eb4b7084d --- /dev/null +++ b/lib/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools", "protobuf<4.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pycriu" +description = "Python bindings for CRIU" +authors = [ + {name = "CRIU team", email = "criu@openvz.org"}, +] +license = {text = "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[tool.setuptools] +packages = ["pycriu", "pycriu.images"] + +[tool.setuptools.dynamic] +version = {attr = "pycriu.__version__"} diff --git a/lib/setup.cfg b/lib/setup.cfg new file mode 100644 index 0000000000..23ee48dd5b --- /dev/null +++ b/lib/setup.cfg @@ -0,0 +1,16 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older 
versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = pycriu +description = Python bindings for CRIU +author = CRIU team +author_email = criu@openvz.org +license = GPLv2 +version = attr: pycriu.__version__ + +[options] +packages = find: +python_requires = >=3.6 diff --git a/crit/crit b/lib/setup.py old mode 100755 new mode 100644 similarity index 55% rename from crit/crit rename to lib/setup.py index 3b15ca6545..618ac1de48 --- a/crit/crit +++ b/lib/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 +import setuptools -from pycriu import cli if __name__ == '__main__': - cli.main() + setuptools.setup() diff --git a/test/others/env.sh b/test/others/env.sh index 6d830fb58e..6fa2c9691b 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -1,8 +1,13 @@ #!/bin/sh -CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) +BASE_DIR="$(readlink -f "$(dirname "${BASH_SOURCE[0]}")/../../")" + +CRIU="${BASE_DIR}/criu/criu" criu=$CRIU -CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit) + +export PYTHONPATH="${BASE_DIR}/lib:${BASE_DIR}/crit:${PYTHONPATH-}" +CRIT="python3 -m crit" crit=$CRIT -CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump) + +CRIU_COREDUMP="${BASE_DIR}/coredump/coredump" criu_coredump=$CRIU_COREDUMP diff --git a/test/pycriu b/test/pycriu index d13a8790a9..d1b6ed5c45 120000 --- a/test/pycriu +++ b/test/pycriu @@ -1 +1 @@ -../lib/py/ \ No newline at end of file +../lib/pycriu \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-fin-wait1.hook b/test/zdtm/static/socket-tcp-fin-wait1.hook index 9dcd089991..30f8ce0710 100755 --- a/test/zdtm/static/socket-tcp-fin-wait1.hook +++ b/test/zdtm/static/socket-tcp-fin-wait1.hook @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import sys -sys.path.append("../crit") +sys.path.append("../lib") import pycriu import os, os.path From 
c474816f17577ff79da692ffec26535d0e2d1a9e Mon Sep 17 00:00:00 2001 From: Marcus Folkesson Date: Fri, 20 Oct 2023 08:10:35 +0200 Subject: [PATCH 176/321] Makefile: introduce ARCHCFLAGS for arch specific cflags Do not use $(USERCFLAGS) for anything other than what the user provide. Signed-off-by: Marcus Folkesson --- Makefile | 8 ++++---- test/zdtm/Makefile.inc | 8 ++++---- test/zdtm/lib/Makefile | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 432dce6775..35e96d5d7d 100644 --- a/Makefile +++ b/Makefile @@ -35,18 +35,18 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 endif ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp endif ifeq ($(ARMV),8) # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. 
- USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif @@ -159,7 +159,7 @@ export GMON GMONLDOPT endif AFLAGS += -D__ASSEMBLY__ -CFLAGS += $(USERCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index d345233154..2456260e66 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -23,12 +23,12 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 else ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) # To build aarch32 on armv8 Travis-CI (see criu Makefile) - USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif endif @@ -40,7 +40,7 @@ endif PKG_CONFIG ?= pkg-config CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wdeclaration-after-statement -Wstrict-prototypes -CFLAGS += $(USERCFLAGS) +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) CFLAGS += -D_GNU_SOURCE CPPFLAGS += -iquote $(LIBDIR)/arch/$(ARCH)/include diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index b574e1d3e7..428d726d66 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -1,6 +1,6 @@ LIBDIR := . 
-CFLAGS += $(USERCFLAGS) +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) LIB := libzdtmtst.a From d88dcef7fa38e29475bcb31672859ef1ec369d35 Mon Sep 17 00:00:00 2001 From: sally kang Date: Mon, 27 Nov 2023 22:23:55 +0800 Subject: [PATCH 177/321] compel: correct the syscall number of bind on ARM64 In the compel/arch/arm/plugins/std/syscalls/syscall.def, the syscall number of bind on ARM64 should be 200 instead of 235. Signed-off-by: Sally Kang --- compel/arch/arm/plugins/std/syscalls/syscall.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 7489ee0c11..217e346a31 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -39,7 +39,7 @@ recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, str sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) shutdown 210 293 (int sockfd, int how) -bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +bind 200 282 (int sockfd, const struct sockaddr *addr, int addrlen) setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) From 5e5675698cd39c25357d7794f41bb359580be410 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 29 Nov 2023 08:27:29 +0000 Subject: [PATCH 178/321] ci: fix rawhide netlink error The rawhide netlink errors are fixed with a newer kernel than the default 6.2 available in Fedora 38. 
Signed-off-by: Adrian Reber --- scripts/ci/vagrant.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 328903f385..c0c8e88c1d 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -57,6 +57,11 @@ fedora-no-vdso() { } fedora-rawhide() { + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks + # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously + # installed this reboots the VM. + vagrant reload + ssh default uname -a # # Workaround the problem: # error running container: error from /usr/bin/crun creating container for [...]: sd-bus call: Transport endpoint is not connected From 4213f168d55277b5bc9b22df9773dddee8aa39f0 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 29 Nov 2023 12:30:42 +0000 Subject: [PATCH 179/321] test: check for btrfs in the current directory The old test was checking if '/' is btrfs but we should check if the current directory is btrfs. Signed-off-by: Adrian Reber --- test/jenkins/criu-fault.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 7f503e817a..4a6d55e6bf 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -9,7 +9,7 @@ prep ./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --report report -f h || fail # FIXME: fhandles looks broken on btrfs -grep -P "/.* / " /proc/self/mountinfo | grep -q btrfs || NOBTRFS=$? +findmnt --noheadings --target . | grep -q btrfs || NOBTRFS=$? 
if [ $NOBTRFS -eq 1 ] ; then ./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --pre 2 -f uns || fail fi From 9d3e71a7a9a3ad6c0f3e7fbfdc483af104d0463d Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 29 Nov 2023 14:56:41 +0000 Subject: [PATCH 180/321] ci: switch to permissive selinux mode during test Signed-off-by: Adrian Reber --- scripts/ci/run-ci-tests.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 1aae555f76..e05ead6683 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -292,10 +292,18 @@ if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true; then make -C test/zdtm/ cleanout rm -rf test/dump setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu + if [ -d /sys/fs/selinux ]; then + # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. + selinuxmode=$(getenforce) + setenforce Permissive + fi # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. 
sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" + if [ -d /sys/fs/selinux ]; then + setenforce "$selinuxmode" + fi setcap -r criu/criu else echo "Skipping unprivileged mode tests" From b17a73b2ee21d64a7cc59968f39e784618837397 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 21 Nov 2023 12:02:31 -0800 Subject: [PATCH 181/321] ci: fix codespell errors Signed-off-by: Andrei Vagin --- criu/net.c | 2 +- criu/pagemap-cache.c | 2 +- lib/pycriu/images/pb2dict.py | 2 +- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- scripts/nmk/scripts/main.mk | 2 +- test/zdtm/static/mntns_open.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/criu/net.c b/criu/net.c index e422e2f69b..7109e6876a 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3274,7 +3274,7 @@ int macvlan_ext_add(struct external *ext) /* * The setns() syscall (called by switch_ns()) can be extremely * slow. If we call it two or more times from the same task the - * kernel will synchonously go on a very slow routine called + * kernel will synchronously go on a very slow routine called * synchronize_rcu() trying to put a reference on old namespaces. * * To avoid doing this more than once we pre-create all the diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 00f088ff3f..09dbc6a363 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -115,7 +115,7 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) * fit in solid manner, iow -- either the whole vma fits * the cache window, either plain read is used. * - * The benefit (apart redusing the number of read() calls) + * The benefit (apart reducing the number of read() calls) * is to walk page tables less. 
*/ if (!pagemap_cache_disabled && len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) { diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index fe41642d55..3f5f390e39 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -365,7 +365,7 @@ def pb2dict(pb, pretty=False, is_hex=False): def _dict2pb_cast(field, value): # Not considering TYPE_MESSAGE here, as repeated # and non-repeated messages need special treatment - # in this case, and are hadled separately. + # in this case, and are handled separately. if field.type == FD.TYPE_BYTES: return get_bytes_dec(field)(value) elif field.type == FD.TYPE_ENUM: diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 6d004247be..ef79e5ef42 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -1063,7 +1063,7 @@ static bool iolink_match(struct tp_iolink *src, struct tp_iolink *dest) * * Nodes compatibility are determined by: * 1. Comparing the node properties - * 2. Making sure iolink mappings to CPUs would be compabitle with existing iolink mappings in maps + * 2. Making sure iolink mappings to CPUs would be compatible with existing iolink mappings in maps * * If src_node and dest_node are mappable, then map_device will push the new mapping * for src_node -> dest_node into new_maps. 
diff --git a/scripts/nmk/scripts/main.mk b/scripts/nmk/scripts/main.mk index 493a164f88..7f11bda236 100644 --- a/scripts/nmk/scripts/main.mk +++ b/scripts/nmk/scripts/main.mk @@ -1,7 +1,7 @@ ifndef ____nmk_defined__main # -# Genaral inclusion statement +# General inclusion statement ifndef ____nmk_defined__include include $(__nmk_dir)include.mk diff --git a/test/zdtm/static/mntns_open.c b/test/zdtm/static/mntns_open.c index 7d8bbbaa4e..0430f5b998 100644 --- a/test/zdtm/static/mntns_open.c +++ b/test/zdtm/static/mntns_open.c @@ -17,7 +17,7 @@ #define CLONE_NEWNS 0x00020000 #endif -const char *test_doc = "Check that mnt_id is repsected"; +const char *test_doc = "Check that mnt_id is respected"; const char *test_author = "Pavel Emelianov "; #define MPTS_FILE "F" From 95975e081210f0871f2e7de583331e4802d2129a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 28 Nov 2023 13:18:23 +0000 Subject: [PATCH 182/321] docker-test: fix condition for max tries Replace a recursive call with a loop. Reported-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index 22d326a371..174c2e109b 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -87,27 +87,25 @@ print_logs () { } declare -i max_restore_container_tries=3 -current_iteration= restore_container () { CHECKPOINT_NAME=$1 - docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { + for i in $(seq $max_restore_container_tries); do + docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log && break + # FIXME: There is a race condition in docker/containerd that causes # docker to occasionally fail when starting a container from a # checkpoint immediately after the checkpoint has been created. 
# https://github.com/moby/moby/issues/42900 - if [ "$current_iteration" -gt "$max_restore_container_tries" ]; then + if grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log; then + echo "Retry container restore: $i/$max_restore_container_tries" + sleep 1; + else print_logs fi - grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log && { - ((current_iteration+=1)) - echo "Retry container restore: $current_iteration" - sleep 1; - restore_container "$CHECKPOINT_NAME" - } || - print_logs - } && current_iteration=0 + + done } # Scenario: Create multiple containers and checkpoint and restore them once From 0da1ab25719a39ccdc55e2fb126e822dcc0bfb76 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 29 Nov 2023 11:46:08 +0000 Subject: [PATCH 183/321] docker-test: downgrade docker to v24.0.7 Checkpoint/restore with version 25.0.0-beta.1 fails with the following error: $ docker start --checkpoint=c1 cr Error response from daemon: failed to create task for container: content digest fdb1054b00a8c07f08574ce52198c5501d1f552b6a5fb46105c688c70a9acb45: not found: unknown Release notes: https://github.com/moby/moby/discussions/46816 Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index 174c2e109b..7e7ef71973 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -15,10 +15,11 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - -# shellcheck source=/dev/null -. /etc/lsb-release +# checkpoint/restore is broken in Docker Engine (Community) version 25.0.0-beta.1 +# https://github.com/moby/moby/discussions/46816 +# Downgrade to the latest stable version. 
+VERSION_STRING=5:24.0.7-1~ubuntu.20.04~focal +./apt-install docker-ce=$VERSION_STRING docker-ce-cli=$VERSION_STRING containerd.io docker-buildx-plugin docker-compose-plugin # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json From 378da3b5930b1583987d79707d2581d36eace720 Mon Sep 17 00:00:00 2001 From: "Ivan A. Melnikov" Date: Wed, 6 Dec 2023 18:01:08 +0400 Subject: [PATCH 184/321] Makefile: Use common warnings settings for loongarch64 WARNINGS variable should be amended, not redefined. We still need, e.g., `-Wno-dangling-pointer` to build criu on loongarch64 with gcc13. Signed-off-by: Ivan A. Melnikov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 35e96d5d7d..7e6bb40d0d 100644 --- a/Makefile +++ b/Makefile @@ -127,7 +127,7 @@ WARNINGS := -rdynamic endif ifeq ($(ARCH),loongarch64) -WARNINGS := -Wno-implicit-function-declaration +WARNINGS += -Wno-implicit-function-declaration endif ifneq ($(GCOV),) From dc49eb4c6665c36d84099123ac53f9f60f318805 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 8 Dec 2023 17:36:32 +0100 Subject: [PATCH 185/321] tty: skip ioctl(TIOCSLCKTRMIOS) if possible If ioctl(TIOCSLCKTRMIOS) fails with EPERM it means that a CRIU process lacks the CAP_SYS_ADMIN capability. But we can use ioctl(TIOCGLCKTRMIOS) to *read* current ->termios_locked value from the kernel and if it's the same as we already have we can skip failing ioctl(TIOCSLCKTRMIOS) safely. Adrian has recently posted [1] a very good patch to allow ioctl(TIOCSLCKTRMIOS) for processes that have CAP_CHECKPOINT_RESTORE (right now it requires CAP_SYS_ADMIN).
[1] https://lore.kernel.org/all/20231206134340.7093-1-areber@redhat.com/ Suggested-by: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/tty.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/criu/tty.c b/criu/tty.c index 9faf602f20..ae23094b7b 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -817,8 +817,26 @@ static int do_restore_tty_parms(void *arg, int fd, pid_t pid) * on termios too. Just to be on the safe side. */ - if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) - goto err; + if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) { + struct termios t; + + if (errno != EPERM) + goto err; + + memzero(&t, sizeof(t)); + if (ioctl(fd, TIOCGLCKTRMIOS, &t) < 0) { + pr_perror("Can't get tty locked params on %#x", p->tty_id); + goto err; + } + + /* + * The ioctl(TIOCSLCKTRMIOS) requires a CRIU process to be privileged + * in the init_user_ns, but if the current "termios_locked" value equal + * to the "termios_locked" value from the image, we can safely skip setting it. + */ + if (memcmp(&t, &p->tl, sizeof(struct termios)) != 0) + goto err; + } if ((p->has & HAS_TERMIOS) && ioctl(fd, TCSETS, &p->t) < 0) goto err; From 157306435d9324674690ee73d6380ba8fafd2dd0 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 30 Nov 2023 13:29:56 +0000 Subject: [PATCH 186/321] ci: do not use 'tail' for skip-file-rwx-check test Newer versions of 'tail' rely on inotify and after a restore 'tail' is unhappy with the state of inotify and just stops. This replaces 'tail' with a minimal shell based test (thanks Andrei). 
Signed-off-by: Adrian Reber --- test/others/skip-file-rwx-check/run.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/others/skip-file-rwx-check/run.sh b/test/others/skip-file-rwx-check/run.sh index 0803d78eca..0776ebf618 100755 --- a/test/others/skip-file-rwx-check/run.sh +++ b/test/others/skip-file-rwx-check/run.sh @@ -10,11 +10,11 @@ source ../env.sh make clean touch testfile chmod +w testfile -tail --follow testfile & -tailpid=$! -if ! "$criu" dump --tree=$tailpid --shell-job --verbosity=4 --log-file=dump.log +bash -c 'exec 3 Date: Thu, 30 Nov 2023 14:13:37 +0000 Subject: [PATCH 187/321] ci: fix centos-stream 9 ci errors The image has a too old version of nettle which does not work with gnutls. Just upgrade to the latest to make the error go away. Signed-off-by: Adrian Reber --- .cirrus.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 6a586d58b5..adaa9be334 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -37,6 +37,9 @@ task: dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto libdrm-devel + # The image has a too old version of nettle which does not work with gnutls. + # Just upgrade to the latest to make the error go away. + dnf -y upgrade nettle nettle-devel systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. 
# The Cirrus CI user runs as a service from selinux point of view and is From e7aca13ed34a0ec76a2978cfe21f5e137bd5874f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 30 Nov 2023 15:10:23 +0000 Subject: [PATCH 188/321] ci: disable non-root in user namespace test in container Signed-off-by: Adrian Reber --- scripts/ci/prepare-for-fedora-rawhide.sh | 1 + scripts/ci/run-ci-tests.sh | 11 ++++++++--- scripts/ci/vagrant.sh | 4 ++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index e31814a955..d812c5faa5 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -18,6 +18,7 @@ dnf install -y \ libnet-devel \ libnl3-devel \ libbsd-devel \ + libselinux-utils \ make \ procps-ng \ protobuf-c-devel \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index e05ead6683..ef7e869e03 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -288,11 +288,16 @@ ip net add test # Rootless tests # Check if cap_checkpoint_restore is supported and also if unshare -c is supported. -if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true; then +# +# Do not run this test in a container (see https://github.com/checkpoint-restore/criu/issues/2312). +# This is a temporary workaround until fixed in the kernel. +# The kernel currently does not show correct device and inode numbers in /proc/pid/maps +# for stackable file systems. +if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true && [ ! -e /run/.containerenv ]; then make -C test/zdtm/ cleanout rm -rf test/dump setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu - if [ -d /sys/fs/selinux ]; then + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. 
selinuxmode=$(getenforce) setenforce Permissive @@ -301,7 +306,7 @@ if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true; then # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" - if [ -d /sys/fs/selinux ]; then + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then setenforce "$selinuxmode" fi setcap -r criu/criu diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c0c8e88c1d..c8cf0be744 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -70,6 +70,10 @@ fedora-rawhide() { # ssh default 'sudo dnf remove -y crun || true' ssh default sudo dnf install -y podman runc + # Some tests in the container need selinux to be disabled. + # In the container it is not possible to change the state of selinux. + # Let's just disable it for this test run completely. + ssh default 'sudo setenforce Permissive' ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } From 61224f2b55251b30364a125e36c9ce134b4d627f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 20 Oct 2023 10:59:19 +0100 Subject: [PATCH 189/321] gitignore: remove historical left-over files Commit [1] introduced a mechanism to auto-generate the files: sys-exec-tbl*.c, syscalls*.S, syscall-codes*.h, and syscall*.h. That commit also updated the gitignore rules to ignore auto-generated files. However, after commit [2], the path for these files has changed and the patterns specified in gitignore are no longer needed.
[1] bbc2f133 (x86/build: generate syscalls-{64,32}.built-in.o) [2] 19fadee9 (compel: plugins,std -- Implement syscalls in std plugin) Reported-by: @felicitia Signed-off-by: Radostin Stoyanov --- .gitignore | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitignore b/.gitignore index 2f2ab20290..854657d1c1 100644 --- a/.gitignore +++ b/.gitignore @@ -25,12 +25,6 @@ images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest -criu/arch/*/sys-exec-tbl*.c -# x86 syscalls-table is not generated -!criu/arch/x86/sys-exec-tbl.c -criu/arch/*/syscalls*.S -criu/include/syscall-codes*.h -criu/include/syscall*.h criu/include/version.h criu/pie/restorer-blob.h criu/pie/parasite-blob.h From 50aa6da65adbeea50152753b86fb02f22bb88a22 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 27 Dec 2023 20:33:30 -0800 Subject: [PATCH 190/321] make: fix compilation on alpine Starting with the musl v1.2.4~69, _GNU_SOURCE doesn't set _LARGEFILE64_SOURCE. Fixes #2313 Signed-off-by: Andrei Vagin --- Makefile | 1 + scripts/build/Dockerfile.alpine | 2 +- test/zdtm/Makefile.inc | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7e6bb40d0d..ff0ca92db6 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,7 @@ export PROTOUFIX DEFINES # # Independent options for all tools. 
DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_LARGEFILE64_SOURCE DEFINES += -D_GNU_SOURCE WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index cb746757a4..593e190315 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -47,6 +47,6 @@ RUN apk add \ # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml +RUN pip3 install junit_xml --break-system-packages RUN make -C test/zdtm diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 2456260e66..24f32c6068 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -41,7 +41,7 @@ PKG_CONFIG ?= pkg-config CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wdeclaration-after-statement -Wstrict-prototypes CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) -CFLAGS += -D_GNU_SOURCE +CFLAGS += -D_GNU_SOURCE -D_LARGEFILE64_SOURCE CPPFLAGS += -iquote $(LIBDIR)/arch/$(ARCH)/include ifeq ($(strip $(V)),) From cda1c5c95aaca5affa28daa59d7f25ed64007a78 Mon Sep 17 00:00:00 2001 From: robert Date: Sun, 7 Jan 2024 15:32:00 -0800 Subject: [PATCH 191/321] irmap: hardcode some more interesting paths Signed-off-by: robert --- criu/irmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/irmap.c b/criu/irmap.c index e12df5cb58..37d098db11 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -67,6 +67,7 @@ static struct irmap hints[] = { .path = "/var/log", .nr_kids = -1, }, + { .path = "/usr/share/dbus-1/services", .nr_kids = -1 }, { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 }, From 0416d81e4d78e39ca5cdc01aa34115a05a73c4c0 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Jan 2024 14:40:25 +0000 Subject: [PATCH 192/321] net: fix network 
unlock with iptables-nft When iptables-nft is used as backend for iptables, the rules for network locking are translated into the following nft rules: ``` $ iptables-restore-translate -f lock.txt add table ip filter add chain ip filter CRIU insert rule ip filter INPUT counter jump CRIU insert rule ip filter OUTPUT counter jump CRIU add rule ip filter CRIU mark 0xc114 counter accept add rule ip filter CRIU counter drop ``` These rules create the following chains: ``` table ip filter { # handle 1 chain CRIU { # handle 1 meta mark 0x0000c114 counter packets 16 bytes 890 accept # handle 6 counter packets 1 bytes 60 drop # handle 7 meta mark 0x0000c114 counter packets 0 bytes 0 accept # handle 8 counter packets 0 bytes 0 drop # handle 9 } chain INPUT { # handle 2 type filter hook input priority filter; policy accept; counter packets 8 bytes 445 jump CRIU # handle 3 counter packets 0 bytes 0 jump CRIU # handle 10 } chain OUTPUT { # handle 4 type filter hook output priority filter; policy accept; counter packets 9 bytes 505 jump CRIU # handle 5 counter packets 0 bytes 0 jump CRIU # handle 11 } } ``` In order to delete the CRIU chain, we need to first delete all four jump targets. 
Otherwise, `-X CRIU` would fail with the following error: iptables-restore v1.8.10 (nf_tables): line 5: CHAIN_DEL failed (Resource busy): chain CRIU Reported-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/net.c | 50 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/criu/net.c b/criu/net.c index 7109e6876a..b34c379bab 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3178,19 +3178,53 @@ static inline int nftables_network_unlock(void) #endif } +static int iptables_has_criu_jump_target(void) +{ + int fd, ret; + char *argv[4] = { "sh", "-c", "iptables -C INPUT -j CRIU", NULL }; + + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + fd = -1; + pr_perror("failed to open /dev/null, using log fd"); + } + + ret = cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL); + close_safe(&fd); + return ret; +} + static int iptables_network_unlock_internal(void) { - char conf[] = "*filter\n" - ":CRIU - [0:0]\n" - "-D INPUT -j CRIU\n" - "-D OUTPUT -j CRIU\n" - "-X CRIU\n" - "COMMIT\n"; + char delete_jump_targets[] = "*filter\n" + ":CRIU - [0:0]\n" + "-D INPUT -j CRIU\n" + "-D OUTPUT -j CRIU\n" + "COMMIT\n"; + + char delete_criu_chain[] = "*filter\n" + ":CRIU - [0:0]\n" + "-X CRIU\n" + "COMMIT\n"; + int ret = 0; - ret |= iptables_restore(false, conf, sizeof(conf) - 1); + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); if (kdat.ipv6) - ret |= iptables_restore(true, conf, sizeof(conf) - 1); + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + + /* For compatibility with iptables-nft backend, we need to make sure that all jump + * targets have been removed before deleting the CRIU chain. 
+ */ + if (!iptables_has_criu_jump_target()) { + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + } + + ret |= iptables_restore(false, delete_criu_chain, sizeof(delete_criu_chain) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_criu_chain, sizeof(delete_criu_chain) - 1); return ret; } From e5f4d8c6fb97660eb42df1f47fbdcffdf5e0e958 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Jan 2024 18:07:59 +0000 Subject: [PATCH 193/321] test/nfconntrack: use nft or iptables-legacy nft does not support xtables compat expressions https://git.netfilter.org/nftables/commit/?id=79195a8cc9e9d9cf2d17165bf07ac4cc9d55539f Signed-off-by: Radostin Stoyanov --- scripts/build/Dockerfile.alpine | 1 + test/zdtm/static/Makefile | 8 +++--- ...nntrack.c => socket-tcp-ipt-nfconntrack.c} | 0 .../static/socket-tcp-ipt-nfconntrack.desc | 6 +++++ test/zdtm/static/socket-tcp-nfconntrack.desc | 1 - test/zdtm/static/socket-tcp-nft-nfconntrack.c | 1 + .../static/socket-tcp-nft-nfconntrack.desc | 7 +++++ test/zdtm/static/socket-tcp.c | 27 ++++++++++++++++--- 8 files changed, 44 insertions(+), 7 deletions(-) rename test/zdtm/static/{socket-tcp-nfconntrack.c => socket-tcp-ipt-nfconntrack.c} (100%) create mode 100644 test/zdtm/static/socket-tcp-ipt-nfconntrack.desc delete mode 100644 test/zdtm/static/socket-tcp-nfconntrack.desc create mode 120000 test/zdtm/static/socket-tcp-nft-nfconntrack.c create mode 100644 test/zdtm/static/socket-tcp-nft-nfconntrack.desc diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 593e190315..2c58c910e7 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -33,6 +33,7 @@ RUN make mrproper && date && make -j $(nproc) CC="$CC" && date RUN apk add \ ip6tables \ iptables \ + iptables-legacy \ nftables \ iproute2 \ tar \ diff --git 
a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 07d3bc6e21..fb856d55b4 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -85,7 +85,8 @@ TST_NOFILE := \ socket-tcp4v6 \ socket-tcp-local \ socket-tcp-reuseport \ - socket-tcp-nfconntrack \ + socket-tcp-ipt-nfconntrack \ + socket-tcp-nft-nfconntrack \ socket-tcp6-local \ socket-tcp4v6-local \ socket-tcpbuf \ @@ -277,7 +278,7 @@ pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ - bpf_array + bpf_array endif ifneq ($(ARCH),arm) @@ -598,7 +599,8 @@ socket-tcpbuf6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp4v6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV4V6 socket-tcp-local: CFLAGS += -D ZDTM_TCP_LOCAL -socket-tcp-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_CONNTRACK +socket-tcp-ipt-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_IPT_CONNTRACK +socket-tcp-nft-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_NFT_CONNTRACK socket_listen6: CFLAGS += -D ZDTM_IPV6 socket_listen4v6: CFLAGS += -D ZDTM_IPV4V6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 diff --git a/test/zdtm/static/socket-tcp-nfconntrack.c b/test/zdtm/static/socket-tcp-ipt-nfconntrack.c similarity index 100% rename from test/zdtm/static/socket-tcp-nfconntrack.c rename to test/zdtm/static/socket-tcp-ipt-nfconntrack.c diff --git a/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc new file mode 100644 index 0000000000..53dd822854 --- /dev/null +++ b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'has_ipt_legacy', + 'flavor': 'h', + 'opts': '--tcp-established', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp-nfconntrack.desc b/test/zdtm/static/socket-tcp-nfconntrack.desc deleted file mode 100644 index add2513f81..0000000000 --- a/test/zdtm/static/socket-tcp-nfconntrack.desc 
+++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.c b/test/zdtm/static/socket-tcp-nft-nfconntrack.c new file mode 120000 index 0000000000..8cb60dd03a --- /dev/null +++ b/test/zdtm/static/socket-tcp-nft-nfconntrack.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.desc b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc new file mode 100644 index 0000000000..38a4eb3897 --- /dev/null +++ b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc @@ -0,0 +1,7 @@ +{ + 'flavor': 'h', + 'feature': 'network_lock_nftables', + 'opts': '--tcp-established', + 'dopts': '--network-lock nftables', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c index f6ef473853..9830c7860a 100644 --- a/test/zdtm/static/socket-tcp.c +++ b/test/zdtm/static/socket-tcp.c @@ -67,17 +67,38 @@ int main(int argc, char **argv) int val; socklen_t optlen; -#ifdef ZDTM_CONNTRACK +#ifdef ZDTM_IPT_CONNTRACK if (unshare(CLONE_NEWNET)) { pr_perror("unshare"); return 1; } if (system("ip link set up dev lo")) return 1; - if (system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + + if (system("iptables-legacy -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + return 1; + if (system("iptables-legacy -w -A INPUT -j DROP")) + return 1; + +#endif + +#ifdef ZDTM_NFT_CONNTRACK + if (unshare(CLONE_NEWNET)) { + pr_perror("unshare"); return 1; - if (system("iptables -w -A INPUT -j DROP")) + } + if (system("ip link set up dev lo")) + return 1; + + if (system("nft add table ip filter")) return 1; + if (system("nft add chain ip filter INPUT")) + return 1; + if (system("nft add rule ip filter INPUT iifname \"lo\" ip protocol tcp ct state new,established counter accept")) + return 1; + if (system("nft add rule ip filter INPUT counter drop")) + return 1; + #endif #ifdef 
ZDTM_TCP_LOCAL From 8f4430d60d3de0306deac968ce0554c4c9b8e926 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jan 2024 18:09:28 +0000 Subject: [PATCH 194/321] net: add error messages for restore of nftables Show appropriate error messages when restore of nftables fails. Signed-off-by: Radostin Stoyanov --- criu/net.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/criu/net.c b/criu/net.c index b34c379bab..0f7280bb50 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2438,27 +2438,39 @@ static inline int do_restore_nftables(struct cr_img *img) off_t img_data_size; char *buf; - if ((img_data_size = img_raw_size(img)) < 0) + if ((img_data_size = img_raw_size(img)) < 0) { + pr_err("image size mismatch\n"); goto out; + } - if (read_img_str(img, &buf, img_data_size) < 0) + if (read_img_str(img, &buf, img_data_size) < 0) { + pr_err("Failed to read nftables data\n"); goto out; + } nft = nft_ctx_new(NFT_CTX_DEFAULT); - if (!nft) + if (!nft) { + pr_err("Failed to create nft context object\n"); goto buf_free_out; + } + + if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft)) { + pr_err("Failed to enable std/err output buffering\n"); + goto nft_ctx_free_out; + } - if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft) || #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) - nft_run_cmd_from_buffer(nft, buf, strlen(buf))) + if (nft_run_cmd_from_buffer(nft, buf, strlen(buf))) #elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) - nft_run_cmd_from_buffer(nft, buf)) + if (nft_run_cmd_from_buffer(nft, buf)) #else - { - BUILD_BUG_ON(1); - } + BUILD_BUG_ON(1); #endif + { + pr_err("nft command error:\n%s\n%s\n", + nft_ctx_get_error_buffer(nft), buf); goto nft_ctx_free_out; + } exit_code = 0; From 615e45eb9444cf4951d3396487b9f9223fcab4bf Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 25 Oct 2023 01:07:27 +0000 Subject: [PATCH 195/321] kerndat: check the PAGEMAP_SCAN ioctl PAGEMAP_SCAN is a new ioctl that allows to get page 
attributes in a more efficient way than reading pagemap files. Signed-off-by: Andrei Vagin --- criu/cr-check.c | 10 ++++++ criu/include/kerndat.h | 1 + criu/include/pagemap_scan.h | 68 +++++++++++++++++++++++++++++++++++++ criu/kerndat.c | 20 +++++++++++ 4 files changed, 99 insertions(+) create mode 100644 criu/include/pagemap_scan.h diff --git a/criu/cr-check.c b/criu/cr-check.c index cb083b16ca..fea1ce674a 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1382,6 +1382,14 @@ static int check_ipv6_freebind(void) return 0; } +static int check_pagemap_scan(void) +{ + if (!kdat.has_pagemap_scan) + return -1; + + return 0; +} + static int (*chk_feature)(void); /* @@ -1502,6 +1510,7 @@ int cr_check(void) ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); ret |= check_ipv6_freebind(); + ret |= check_pagemap_scan(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1623,6 +1632,7 @@ static struct feature_list feature_list[] = { { "openat2", check_openat2 }, { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, + { "pagemap_scan", check_pagemap_scan }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index f5d409acbf..91dbd494b2 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -86,6 +86,7 @@ struct kerndat_s { struct __ptrace_rseq_configuration libc_rseq_conf; bool has_ipv6_freebind; bool has_membarrier_get_registrations; + bool has_pagemap_scan; }; extern struct kerndat_s kdat; diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h new file mode 100644 index 0000000000..0ad4c9bc0b --- /dev/null +++ b/criu/include/pagemap_scan.h @@ -0,0 +1,68 @@ +#ifndef __CR_PAGEMAP_SCAN_H__ +#define __CR_PAGEMAP_SCAN_H__ + +#ifndef PAGEMAP_SCAN +#include +#include "int.h" + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories.
*/ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + u64 start; + u64 end; + u64 categories; +}; + +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on entire range. 
+ * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + u64 size; + u64 flags; + u64 start; + u64 end; + u64 walk_end; + u64 vec; + u64 vec_len; + u64 max_pages; + u64 category_inverted; + u64 category_mask; + u64 category_anyof_mask; + u64 return_mask; +}; +#endif /* PAGEMAP_SCAN */ + +#endif /* __CR_PAGEMAP_SCAN_H__ */ diff --git a/criu/kerndat.c b/criu/kerndat.c index fef5a46c19..95e7226b2b 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -54,6 +54,7 @@ #include "memfd.h" #include "mount-v2.h" #include "util-caps.h" +#include "pagemap_scan.h" struct kerndat_s kdat = {}; volatile int dummy_var; @@ -74,6 +75,25 @@ static int check_pagemap(void) return -1; } + if (ioctl(fd, PAGEMAP_SCAN, NULL) == 0) { + pr_err("PAGEMAP_SCAN succeeded unexpectedly\n"); + return -1; + } else { + switch (errno) { + case EFAULT: + pr_debug("PAGEMAP_SCAN is supported\n"); + kdat.has_pagemap_scan = true; + break; + case EINVAL: + case ENOTTY: + pr_debug("PAGEMAP_SCAN isn't supported\n"); + break; + default: + pr_perror("PAGEMAP_SCAN failed with unexpected errno"); + return -1; + } + } + retry = 3; while (retry--) { ++dummy_var; From bfa9428dfa3752ca4c9c0b33b691709ecdb37a06 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 23 Oct 2023 18:52:13 +0000 Subject: [PATCH 196/321] page-cache: use the PAGEMAP_SCAN ioctl when it is available Signed-off-by: Andrei Vagin --- criu/include/mem.h | 4 +- criu/include/pagemap-cache.h | 13 +++- criu/include/shmem.h | 3 +- criu/mem.c | 112 +++++++++++++++++++++++------------ 
criu/pagemap-cache.c | 90 +++++++++++++++++++++------- criu/shmem.c | 23 ++++--- 6 files changed, 173 insertions(+), 72 deletions(-) diff --git a/criu/include/mem.h b/criu/include/mem.h index 03574ea3d7..3618c9cc3b 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -7,6 +7,7 @@ #include "pid.h" #include "proc_parse.h" #include "inventory.pb-c.h" +#include "pagemap-cache.h" struct parasite_ctl; struct vm_area_list; @@ -47,5 +48,6 @@ int open_vmas(struct pstree_item *t); int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -bool should_dump_page(VmaEntry *vmae, u64 pme); + +u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty); #endif /* __CR_MEM_H__ */ diff --git a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h index 1d8bbffaf6..875e69e560 100644 --- a/criu/include/pagemap-cache.h +++ b/criu/include/pagemap-cache.h @@ -1,10 +1,12 @@ #ifndef __CR_PAGEMAP_H__ #define __CR_PAGEMAP_H__ +#include #include #include "int.h" #include "common/list.h" +#include "pagemap_scan.h" struct vma_area; @@ -15,9 +17,15 @@ typedef struct { unsigned long start; /* start of area */ unsigned long end; /* end of area */ const struct list_head *vma_head; /* list head of VMAs we're serving */ + int fd; /* file to read PMs from */ + u64 *map; /* local buffer */ size_t map_len; /* length of a buffer */ - int fd; /* file to read PMs from */ + + struct page_region *regs; /* buffer for the PAGEMAP_SCAN ioctl */ + size_t regs_len; /* actual length of regs */ + size_t regs_max_len; /* maximum length of regs */ + size_t regs_idx; /* current index in the regs array */ } pmc_t; #define PMC_INIT \ @@ -26,7 +34,8 @@ typedef struct { } extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); -extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); +extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma); 
extern void pmc_fini(pmc_t *pmc); +extern int pmc_fill(pmc_t *pmc, u64 start, u64 end); #endif /* __CR_PAGEMAP_H__ */ diff --git a/criu/include/shmem.h b/criu/include/shmem.h index 813ef630ef..15cab11464 100644 --- a/criu/include/shmem.h +++ b/criu/include/shmem.h @@ -4,13 +4,14 @@ #include "int.h" #include "common/lock.h" #include "images/vma.pb-c.h" +#include "pagemap-cache.h" struct vma_area; extern int collect_shmem(int pid, struct vma_area *vma); extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); -extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); +extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc); extern int fixup_sysv_shmems(void); extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size); extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); diff --git a/criu/mem.c b/criu/mem.c index 417e0a21de..f56ed826b3 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -99,7 +99,7 @@ static inline bool __page_in_parent(bool dirty) return opts.track_mem && opts.img_parent && !dirty; } -bool should_dump_page(VmaEntry *vmae, u64 pme) +static bool should_dump_entire_vma(VmaEntry *vmae) { /* * vDSO area must be always dumped because on restore @@ -107,30 +107,53 @@ bool should_dump_page(VmaEntry *vmae, u64 pme) */ if (vma_entry_is(vmae, VMA_AREA_VDSO)) return true; - /* - * In turn VVAR area is special and referenced from - * vDSO area by IP addressing (at least on x86) thus - * never ever dump its content but always use one provided - * by the kernel on restore, ie runtime VVAR area must - * be remapped into proper place.. 
- */ - if (vma_entry_is(vmae, VMA_AREA_VVAR)) - return false; - - /* - * Optimisation for private mapping pages, that haven't - * yet being COW-ed - */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; - if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) - return true; return false; } +/* + * should_dump_page returns vaddr if an addressed page has to be dumped. + * Otherwise, it returns an address that has to be inspected next. + */ +u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) +{ + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) + return -1; + + if (pmc->regs) { + while (1) { + if (pmc->regs_idx == pmc->regs_len) + return pmc->end; + if (vaddr < pmc->regs[pmc->regs_idx].end) + break; + pmc->regs_idx++; + } + if (vaddr < pmc->regs[pmc->regs_idx].start) + return pmc->regs[pmc->regs_idx].start; + if (softdirty) + *softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + return vaddr; + } else { + u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + + /* + * Optimisation for private mapping pages, that haven't + * yet being COW-ed + */ + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) + return vaddr + PAGE_SIZE; + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { + if (softdirty) + *softdirty = pme & PME_SOFT_DIRTY; + return vaddr; + } + + return vaddr + PAGE_SIZE; + } +} + bool page_is_zero(u64 pme) { return __page_is_zero(pme); @@ -164,25 +187,30 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr) * the memory contents is present in the parent image set. 
*/ -static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, +static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr, bool has_parent) { - u64 *at = &map[PAGE_PFN(*off)]; - unsigned long pfn, nr_to_scan; + unsigned long nr_scanned; unsigned long pages[3] = {}; + unsigned long vaddr; + bool dump_all_pages; int ret = 0; - nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; + dump_all_pages = should_dump_entire_vma(vma->e); - for (pfn = 0; pfn < nr_to_scan; pfn++) { - unsigned long vaddr; + nr_scanned = 0; + for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; + bool softdirty = false; + u64 next; int st; - if (!should_dump_page(vma->e, at[pfn])) + /* If dump_all_pages is true, should_dump_page is called to get pme. */ + next = should_dump_page(pmc, vma->e, vaddr, &softdirty); + if (!dump_all_pages && next != vaddr) { + vaddr = next - PAGE_SIZE; continue; - - vaddr = vma->e->start + *off + pfn * PAGE_SIZE; + } if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) ppb_flags |= PPB_LAZY; @@ -194,7 +222,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. 
*/ - if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { + if (has_parent && page_in_parent(softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { @@ -214,9 +242,8 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct pages[st]++; } - *off += pfn * PAGE_SIZE; - - cnt_add(CNT_PAGES_SCANNED, nr_to_scan); + *pvaddr = vaddr; + cnt_add(CNT_PAGES_SCANNED, nr_scanned); cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); @@ -356,12 +383,20 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl, pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode) { - u64 off = 0; - u64 *map; + u64 vaddr; int ret; if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED)) return 0; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. 
+ */ + if (vma_entry_is(vma->e, VMA_AREA_VVAR)) + return 0; /* * To facilitate any combination of pre-dump modes to run after @@ -421,15 +456,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str has_parent = false; } - map = pmc_get_map(pmc, vma); - if (!map) + if (pmc_get_map(pmc, vma)) return -1; if (vma_area_is(vma, VMA_ANON_SHARED)) - return add_shmem_area(item->pid->real, vma->e, map); - + return add_shmem_area(item->pid->real, vma->e, pmc); + vaddr = vma->e->start; again: - ret = generate_iovs(item, vma, pp, map, &off, has_parent); + ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent); if (ret == -EAGAIN) { BUG_ON(!(pp->flags & PP_CHUNK_MODE)); diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 09dbc6a363..d9bd1bc86a 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -1,5 +1,6 @@ #include #include +#include #include "page.h" #include "pagemap-cache.h" @@ -22,6 +23,8 @@ #define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64)) +#define PAGE_REGIONS_MAX_NR 32768 + /* * It's a workaround for a kernel bug. In the 3.19 kernel when pagemap are read * for a few vma-s for one read call, it returns incorrect data. 
@@ -50,10 +53,23 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz pmc->pid = pid; pmc->map_len = PAGEMAP_LEN(map_size); pmc->vma_head = vma_head; - - pmc->map = xmalloc(pmc->map_len); - if (!pmc->map) - goto err; + pmc->regs_max_len = PAGE_PFN(map_size); + if (pmc->regs_max_len > PAGE_REGIONS_MAX_NR) + pmc->regs_max_len = PAGE_REGIONS_MAX_NR; + pmc->regs_len = 0; + pmc->regs_idx = 0; + pmc->regs = NULL; + pmc->map = NULL; + + if (kdat.has_pagemap_scan) { + pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region)); + if (!pmc->regs) + goto err; + } else { + pmc->map = xmalloc(pmc->map_len); + if (!pmc->map) + goto err; + } if (pagemap_cache_disabled) pr_warn_once("The pagemap cache is disabled\n"); @@ -87,17 +103,11 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz return -1; } -static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr) -{ - return &pmc->map[PAGE_PFN(addr - pmc->start)]; -} - static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) { unsigned long low = vma->e->start & PMC_MASK; unsigned long high = low + PMC_SIZE; size_t len = vma_area_len(vma); - size_t size_map; if (high > kdat.task_size) high = kdat.task_size; @@ -149,39 +159,79 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) pr_debug("\t%d: simple mode [l:%lx h:%lx]\n", pmc->pid, pmc->start, pmc->end); } + return pmc_fill(pmc, pmc->start, pmc->end); +} + +int pmc_fill(pmc_t *pmc, u64 start, u64 end) +{ + size_t size_map; + + pmc->start = start; + pmc->end = end; + size_map = PAGEMAP_LEN(pmc->end - pmc->start); BUG_ON(pmc->map_len < size_map); BUG_ON(pmc->fd < 0); - if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { - pmc_zap(pmc); - pr_perror("Can't read %d's pagemap file", pmc->pid); - return -1; + if (pmc->regs) { + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = pmc->start, + .end = pmc->end, + .vec = 
(long)pmc->regs, + .vec_len = pmc->regs_max_len, + .max_pages = 0, + /* + * Request pages that are in RAM or swap, excluding + * zero-filled and file-backed pages. + */ + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; + long ret; + + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); + if (ret == -1) { + pr_perror("PAGEMAP_SCAN"); + pmc_zap(pmc); + return -1; + } + pmc->regs_len = ret; + pmc->regs_idx = 0; + pmc->end = args.walk_end; + } else { + if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; + } } return 0; } -u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma) +int pmc_get_map(pmc_t *pmc, const struct vma_area *vma) { /* Hit */ if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end)) - return __pmc_get_map(pmc, vma->e->start); + return 0; /* Miss, refill the cache */ if (pmc_fill_cache(pmc, vma)) { pr_err("Failed to fill cache for %d (%lx-%lx)\n", pmc->pid, (long)vma->e->start, (long)vma->e->end); - return NULL; + return -1; } - - /* Hit for sure */ - return __pmc_get_map(pmc, vma->e->start); + return 0; } void pmc_fini(pmc_t *pmc) { close_safe(&pmc->fd); xfree(pmc->map); + xfree(pmc->regs); pmc_reset(pmc); } diff --git a/criu/shmem.c b/criu/shmem.c index c13a39b660..9e3178352d 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,23 +206,28 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma) +static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; + u64 vaddr; if (!is_shmem_tracking_en()) return; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); - for 
(vma_pfn = 0; vma_pfn < vma_pgcnt; ++vma_pfn) { - if (!should_dump_page(vma, map[vma_pfn])) + for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { + bool softdirty = false; + u64 next; + + next = should_dump_page(pmc, vma, vaddr, &softdirty); + if (next != vaddr) { + vaddr = next - PAGE_SIZE; continue; + } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (map[vma_pfn] & PME_SOFT_DIRTY) + if (softdirty) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); - else if (page_is_zero(map[vma_pfn])) - set_pstate(si->pstate_map, shmem_pfn, PST_ZERO); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } @@ -648,7 +653,7 @@ static int open_shmem(int pid, struct vma_area *vma) return -1; } -int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) +int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) { struct shmem_info *si; unsigned long size = vma->pgoff + (vma->end - vma->start); @@ -662,7 +667,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; } @@ -679,7 +684,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; } From 50190ae0ac25aeecbaa0688ab3d604340438a586 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 29 Nov 2023 23:25:44 +0000 Subject: [PATCH 197/321] pagemap-cache: add an ability to run tests without PAGEMAP_SCAN This change adds a new injectable fault (135) to disable PAGEMAP_SCAN and fall back to reading pagemap files. 
Signed-off-by: Andrei Vagin --- criu/include/fault-injection.h | 1 + criu/pagemap-cache.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 69d670be93..fe75dfe860 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -19,6 +19,7 @@ enum faults { FI_HUGE_ANON_SHMEM_ID = 132, FI_CANNOT_MAP_VDSO = 133, FI_CORRUPT_EXTREGS = 134, + FI_DONT_USE_PAGEMAP_SCAN = 135, FI_MAX, }; diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index d9bd1bc86a..978a6b1aca 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -11,6 +11,7 @@ #include "vma.h" #include "mem.h" #include "kerndat.h" +#include "fault-injection.h" #undef LOG_PREFIX #define LOG_PREFIX "pagemap-cache: " @@ -61,7 +62,7 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz pmc->regs = NULL; pmc->map = NULL; - if (kdat.has_pagemap_scan) { + if (kdat.has_pagemap_scan && !fault_injected(FI_DONT_USE_PAGEMAP_SCAN)) { pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region)); if (!pmc->regs) goto err; From dfd7d63f4372e0abc75ff2dfb278f72ab0a27a75 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 22 Jan 2024 17:50:39 +0800 Subject: [PATCH 198/321] zdtm: socket-tcp-nft-nfconntrack: add a hook to the chain in nft case Let's use hooked nft chain which actually affects packets. 
Fixes: e5f4d8c6f ("test/nfconntrack: use nft or iptables-legacy") Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/socket-tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c index 9830c7860a..bc20754963 100644 --- a/test/zdtm/static/socket-tcp.c +++ b/test/zdtm/static/socket-tcp.c @@ -92,7 +92,7 @@ int main(int argc, char **argv) if (system("nft add table ip filter")) return 1; - if (system("nft add chain ip filter INPUT")) + if (system("nft 'add chain ip filter INPUT { type filter hook input priority 0 ; }'")) return 1; if (system("nft add rule ip filter INPUT iifname \"lo\" ip protocol tcp ct state new,established counter accept")) return 1; From 63494735da79c576945ce19adc987d7f82a8b617 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 24 Jan 2024 10:39:00 +0000 Subject: [PATCH 199/321] criu-log: remove unused declaration This patch removes a leftover declaration for log_closedir() which has been removed in the following commit: dc80d6f125e1e919363a0b8f938b8679ff0dbc2b log: get rid of LOG_DIR_FD_OFF and opening cwd in log_init() Signed-off-by: Radostin Stoyanov --- criu/include/criu-log.h | 1 - 1 file changed, 1 deletion(-) diff --git a/criu/include/criu-log.h b/criu/include/criu-log.h index ae2f38489c..9d52fbdb17 100644 --- a/criu/include/criu-log.h +++ b/criu/include/criu-log.h @@ -26,7 +26,6 @@ extern int log_init(const char *output); extern void log_fini(void); extern int log_init_by_pid(pid_t pid); -extern void log_closedir(void); extern int log_keep_err(void); extern char *log_first_err(void); From 07a090b245a3ab4f1f3509a77d3e49f43088c6a7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 23 Jan 2024 07:28:40 +0000 Subject: [PATCH 200/321] net: return bool with iptable_has_criu_jump_target To improve readability, this patch changes the return type of iptables_has_criu_jump_target() to a boolean, where 'true' indicates that iptables has CRIU jump target 
and 'false' indicates otherwise. Suggested-by: Pavel Tikhomirov Signed-off-by: Radostin Stoyanov --- criu/net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/criu/net.c b/criu/net.c index 0f7280bb50..b5c4a6ee32 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3190,7 +3190,7 @@ static inline int nftables_network_unlock(void) #endif } -static int iptables_has_criu_jump_target(void) +static bool iptables_has_criu_jump_target(void) { int fd, ret; char *argv[4] = { "sh", "-c", "iptables -C INPUT -j CRIU", NULL }; @@ -3203,7 +3203,7 @@ static int iptables_has_criu_jump_target(void) ret = cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL); close_safe(&fd); - return ret; + return !ret; } static int iptables_network_unlock_internal(void) @@ -3228,7 +3228,7 @@ static int iptables_network_unlock_internal(void) /* For compatibility with iptables-nft backend, we need to make sure that all jump * targets have been removed before deleting the CRIU chain. */ - if (!iptables_has_criu_jump_target()) { + if (iptables_has_criu_jump_target()) { ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); if (kdat.ipv6) ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); From a9cbdad76fa6daf95ea38bd1d5f912d260a84aed Mon Sep 17 00:00:00 2001 From: David Francis Date: Tue, 30 Jan 2024 14:59:48 -0500 Subject: [PATCH 201/321] plugin/amdgpu: Don't print error for "No such process" during resume During the late stages of restore, each process being resumed gets an ioctl call to KFD_CRIU_OP_RESUME. If the process has no kfd process info, this call will fail with -ESRCH. This is normal behaviour, so we shouldn't print an error message for it. 
Signed-off-by: David Francis --- plugins/amdgpu/amdgpu_plugin.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 32ff8f9364..3675353a77 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1999,7 +1999,10 @@ int amdgpu_plugin_resume_devices_late(int target_pid) args.op = KFD_CRIU_OP_RESUME; pr_info("Calling IOCTL to start notifiers and queues\n"); if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("restore late ioctl failed"); + if (errno == ESRCH) + pr_info("Pid %d has no kfd process info\n", target_pid); + else + pr_perror("restore late ioctl failed"); ret = -1; } From 639068ecab26096092d460d6bbed10bedb5da0c4 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 5 Feb 2024 11:17:40 +0800 Subject: [PATCH 202/321] plugin/amdgpu: Also don't print 'plugin failed' in criu We already don't treat it as error in the plugin itself, but after returning -1 from RESUME_DEVICES_LATE hook we print debug message in criu about failed plugin, let's return 0 instead. While on it let's replace ret to exit_code. 
Fixes: a9cbdad76 ("plugin/amdgpu: Don't print error for "No such process" during resume") Signed-off-by: Pavel Tikhomirov --- plugins/amdgpu/amdgpu_plugin.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 3675353a77..23253632df 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1985,7 +1985,7 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, amdgpu_plugin_update_vma int amdgpu_plugin_resume_devices_late(int target_pid) { struct kfd_ioctl_criu_args args = { 0 }; - int fd, ret = 0; + int fd, exit_code = 0; pr_info("Inside %s for target pid = %d\n", __func__, target_pid); @@ -1999,15 +1999,16 @@ int amdgpu_plugin_resume_devices_late(int target_pid) args.op = KFD_CRIU_OP_RESUME; pr_info("Calling IOCTL to start notifiers and queues\n"); if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - if (errno == ESRCH) + if (errno == ESRCH) { pr_info("Pid %d has no kfd process info\n", target_pid); - else + } else { pr_perror("restore late ioctl failed"); - ret = -1; + exit_code = -1; + } } close(fd); - return ret; + return exit_code; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) From 81f2c41df6ffee7e4962b1facb17c327676285cc Mon Sep 17 00:00:00 2001 From: Ramesh Errabolu Date: Fri, 10 Nov 2023 11:36:34 -0600 Subject: [PATCH 203/321] amdgpu_plugin: Refactor code in preparation to support C&R for DRM devices Add a new compilation unit to host symbols and methods that will be needed to C&R DRM devices. 
Refactor code that indicates support for C&R and checkpoints KFD and DRM devices Signed-off-by: Ramesh Errabolu --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 245 ++++-------------------- plugins/amdgpu/amdgpu_plugin_drm.c | 63 ++++++ plugins/amdgpu/amdgpu_plugin_drm.h | 22 +++ plugins/amdgpu/amdgpu_plugin_topology.c | 41 ++-- plugins/amdgpu/amdgpu_plugin_topology.h | 2 + plugins/amdgpu/amdgpu_plugin_util.c | 208 ++++++++++++++++++++ plugins/amdgpu/amdgpu_plugin_util.h | 106 ++++++++++ plugins/amdgpu/criu-amdgpu.proto | 18 +- 9 files changed, 460 insertions(+), 247 deletions(-) create mode 100644 plugins/amdgpu/amdgpu_plugin_drm.c create mode 100644 plugins/amdgpu/amdgpu_plugin_drm.h create mode 100755 plugins/amdgpu/amdgpu_plugin_util.c create mode 100755 plugins/amdgpu/amdgpu_plugin_util.h diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 64a923d388..5efa8fb0ba 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -28,7 +28,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc-c --proto_path=. --c_out=. 
criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_topology.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 23253632df..60e04f9735 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -30,55 +30,14 @@ #include "files.h" #include "common/list.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" #include "img-streamer.h" #include "image.h" #include "cr_options.h" -#define AMDGPU_KFD_DEVICE "/dev/kfd" -#define PROCPIDMEM "/proc/%d/mem" -#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" -#define HSAKMT_SHM "/hsakmt_shared_mem" -#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" -#define HSAKMT_SEM "hsakmt_semaphore" - -#define KFD_IOCTL_MAJOR_VERSION 1 -#define MIN_KFD_IOCTL_MINOR_VERSION 8 - -#define IMG_KFD_FILE "amdgpu-kfd-%d.img" -#define IMG_RENDERD_FILE "amdgpu-renderD-%d.img" -#define IMG_PAGES_FILE "amdgpu-pages-%d-%04x.img" - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif - -#ifdef LOG_PREFIX -#undef LOG_PREFIX -#endif -#define LOG_PREFIX "amdgpu_plugin: " - -#ifdef DEBUG -#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) -#else -#define plugin_log_msg(fmt, ...) 
\ - { \ - } -#endif - -#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) - -#define SDMA_OPCODE_COPY 1 -#define SDMA_COPY_SUB_OPCODE_LINEAR 0 -#define SDMA_NOP 0 -#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21) - -enum sdma_op_type { - SDMA_OP_VRAM_READ, - SDMA_OP_VRAM_WRITE, -}; - struct vma_metadata { struct list_head list; uint64_t old_pgoff; @@ -89,143 +48,13 @@ struct vma_metadata { }; /************************************ Global Variables ********************************************/ -struct tp_system src_topology; -struct tp_system dest_topology; - -struct device_maps checkpoint_maps; -struct device_maps restore_maps; - -extern int fd_next; static LIST_HEAD(update_vma_info_list); -extern bool kfd_fw_version_check; -extern bool kfd_sdma_fw_version_check; -extern bool kfd_caches_count_check; -extern bool kfd_num_gws_check; -extern bool kfd_vram_size_check; -extern bool kfd_numa_check; -extern bool kfd_capability_check; - size_t kfd_max_buffer_size; /**************************************************************************************************/ -int write_fp(FILE *fp, const void *buf, const size_t buf_len) -{ - size_t len_write; - - len_write = fwrite(buf, 1, buf_len, fp); - if (len_write != buf_len) { - pr_perror("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); - return -EIO; - } - return 0; -} - -int read_fp(FILE *fp, void *buf, const size_t buf_len) -{ - size_t len_read; - - len_read = fread(buf, 1, buf_len, fp); - if (len_read != buf_len) { - pr_perror("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len); - return -EIO; - } - return 0; -} - -/** - * @brief Open an image file - * - * We store the size of the actual contents in the first 8-bytes of the file. This allows us to - * determine the file size when using criu_image_streamer when fseek and fstat are not available. - * The FILE * returned is already at the location of the first actual contents. 
- * - * @param path The file path - * @param write False for read, true for write - * @param size Size of actual contents - * @return FILE *if successful, NULL if failed - */ -FILE *open_img_file(char *path, bool write, size_t *size) -{ - FILE *fp = NULL; - int fd, ret; - - if (opts.stream) - fd = img_streamer_open(path, write ? O_DUMP : O_RSTR); - else - fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); - - if (fd < 0) { - pr_perror("%s: Failed to open for %s", path, write ? "write" : "read"); - return NULL; - } - - fp = fdopen(fd, write ? "w" : "r"); - if (!fp) { - pr_perror("%s: Failed get pointer for %s", path, write ? "write" : "read"); - close(fd); - return NULL; - } - - if (write) - ret = write_fp(fp, size, sizeof(*size)); - else - ret = read_fp(fp, size, sizeof(*size)); - - if (ret) { - pr_perror("%s:Failed to access file size", path); - fclose(fp); - return NULL; - } - - pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size); - return fp; -} - -/** - * @brief Write an image file - * - * We store the size of the actual contents in the first 8-bytes of the file. This allows us to - * determine the file size when using criu_image_streamer when fseek and fstat are not available. - * - * @param path The file path - * @param buf pointer to data to be written - * @param buf_len size of buf - * @return 0 if successful. 
-errno on failure - */ -int write_img_file(char *path, const void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; - size_t len = buf_len; - - fp = open_img_file(path, true, &len); - if (!fp) - return -errno; - - ret = write_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} - -int read_file(const char *file_path, void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; - - fp = fopen(file_path, "r"); - if (!fp) { - pr_perror("Cannot fopen %s", file_path); - return -errno; - } - - ret = read_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} - /* Call ioctl, restarting if it is interrupted */ int kmtIoctl(int fd, unsigned long request, void *arg) { @@ -263,21 +92,21 @@ static void free_e(CriuKfd *e) static int allocate_device_entries(CriuKfd *e, int num_of_devices) { - e->device_entries = xmalloc(sizeof(DeviceEntry *) * num_of_devices); + e->device_entries = xmalloc(sizeof(KfdDeviceEntry *) * num_of_devices); if (!e->device_entries) { pr_err("Failed to allocate device_entries\n"); return -ENOMEM; } for (int i = 0; i < num_of_devices; i++) { - DeviceEntry *entry = xzalloc(sizeof(*entry)); + KfdDeviceEntry *entry = xzalloc(sizeof(*entry)); if (!entry) { pr_err("Failed to allocate entry\n"); return -ENOMEM; } - device_entry__init(entry); + kfd_device_entry__init(entry); e->device_entries[i] = entry; e->n_device_entries++; @@ -287,21 +116,21 @@ static int allocate_device_entries(CriuKfd *e, int num_of_devices) static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucket *bo_bucket_ptr) { - e->bo_entries = xmalloc(sizeof(BoEntry *) * num_bos); + e->bo_entries = xmalloc(sizeof(KfdBoEntry *) * num_bos); if (!e->bo_entries) { pr_err("Failed to allocate bo_info\n"); return -ENOMEM; } for (int i = 0; i < num_bos; i++) { - BoEntry *entry = xzalloc(sizeof(*entry)); + KfdBoEntry *entry = xzalloc(sizeof(*entry)); if (!entry) { pr_err("Failed to allocate botest\n"); return -ENOMEM; } - 
bo_entry__init(entry); + kfd_bo_entry__init(entry); e->bo_entries[i] = entry; e->n_bo_entries++; @@ -309,13 +138,13 @@ static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucke return 0; } -int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceEntry **deviceEntries) +int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, KfdDeviceEntry **deviceEntries) { uint32_t devinfo_index = 0; struct tp_node *node; list_for_each_entry(node, &sys->nodes, listm_system) { - DeviceEntry *devinfo = deviceEntries[devinfo_index++]; + KfdDeviceEntry *devinfo = deviceEntries[devinfo_index++]; devinfo->node_id = node->id; @@ -383,11 +212,11 @@ int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceE return 0; } -int devinfo_to_topology(DeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) +int devinfo_to_topology(KfdDeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) { for (int i = 0; i < num_devices; i++) { struct tp_node *node; - DeviceEntry *devinfo = devinfos[i]; + KfdDeviceEntry *devinfo = devinfos[i]; node = sys_add_node(sys, devinfo->node_id, devinfo->gpu_id); if (!node) @@ -549,7 +378,7 @@ struct thread_data { uint32_t gpu_id; pid_t pid; struct kfd_criu_bo_bucket *bo_buckets; - BoEntry **bo_entries; + KfdBoEntry **bo_entries; int drm_fd; int ret; int id; /* File ID used by CRIU to identify KFD image for this process */ @@ -557,8 +386,7 @@ struct thread_data { int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) { - struct stat st_kfd, st_dri_min; - char img_path[128]; + struct stat st_kfd; int ret = 0; pr_debug("Enter %s\n", __func__); @@ -568,27 +396,18 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) return ret; } - snprintf(img_path, sizeof(img_path), "/dev/dri/renderD%d", DRM_FIRST_RENDER_NODE); - - ret = stat(img_path, &st_dri_min); - if (ret == -1) { - pr_perror("stat error for %s", img_path); - return ret; - 
} - - if (major(st_buf->st_rdev) == major(st_kfd.st_rdev) || ((major(st_buf->st_rdev) == major(st_dri_min.st_rdev)) && - (minor(st_buf->st_rdev) >= minor(st_dri_min.st_rdev) && - minor(st_buf->st_rdev) >= DRM_FIRST_RENDER_NODE))) { + /* If input device is KFD return device as supported */ + if (major(st_buf->st_rdev) == major(st_kfd.st_rdev)) { pr_debug("Known non-regular mapping, kfd-renderD%d -> OK\n", minor(st_buf->st_rdev)); - pr_debug("AMD KFD(maj) = %d, DRI(maj,min) = %d:%d VMA Device fd(maj,min) = %d:%d\n", - major(st_kfd.st_rdev), major(st_dri_min.st_rdev), minor(st_dri_min.st_rdev), - major(st_buf->st_rdev), minor(st_buf->st_rdev)); - /* VMA belongs to kfd */ return 0; } - pr_perror("Can't handle the VMA mapping"); - return -ENOTSUP; + /* Determine if input is a DRM device and therefore is supported */ + ret = amdgpu_plugin_drm_handle_device_vma(fd, st_buf); + if (ret) + pr_perror("%s(), Can't handle VMAs of input device\n", __func__); + + return ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -655,8 +474,9 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, void *buffer, size_t buffer_size, - amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type) { uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; @@ -954,7 +774,7 @@ void *dump_bo_contents(void *_thread_data) goto exit; } - snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, 
thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, true, &image_size); if (!bo_contents_fp) { pr_perror("Cannot fopen %s", img_path); @@ -1027,7 +847,7 @@ void *restore_bo_contents(void *_thread_data) max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : SDMA_LINEAR_COPY_MAX_SIZE - 1; - snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, false, &image_size); if (!bo_contents_fp) { pr_perror("Cannot fopen %s", img_path); @@ -1234,7 +1054,7 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd for (i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; - BoEntry *boinfo = e->bo_entries[i]; + KfdBoEntry *boinfo = e->bo_entries[i]; boinfo->gpu_id = bo_bucket->gpu_id; boinfo->addr = bo_bucket->addr; @@ -1391,7 +1211,7 @@ int amdgpu_plugin_dump_file(int fd, int id) criu_render_node__pack(&rd, buf); - snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id); + snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); ret = write_img_file(img_path, buf, len); if (ret) { xfree(buf); @@ -1399,6 +1219,7 @@ int amdgpu_plugin_dump_file(int fd, int id) } xfree(buf); + /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; } @@ -1531,7 +1352,7 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) for (int entries_i = 0; entries_i < e->num_of_cpus + e->num_of_gpus; entries_i++) { struct kfd_criu_device_bucket *device_bucket; - DeviceEntry *devinfo = e->device_entries[entries_i]; + KfdDeviceEntry *devinfo = e->device_entries[entries_i]; struct tp_node *tp_node; if (!devinfo->gpu_id) @@ -1581,7 +1402,7 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) for (int i = 0; i < args->num_bos; i++) { struct kfd_criu_bo_bucket 
*bo_bucket = &bo_buckets[i]; - BoEntry *bo_entry = e->bo_entries[i]; + KfdBoEntry *bo_entry = e->bo_entries[i]; bo_bucket->gpu_id = bo_entry->gpu_id; bo_bucket->addr = bo_entry->addr; @@ -1736,7 +1557,7 @@ int amdgpu_plugin_restore_file(int id) * TODO: Currently, this code will only work if this function is called for /dev/kfd * first as we assume restore_maps is already filled. Need to fix this later. */ - snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id); + snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); pr_info("Restoring RenderD %s\n", img_path); img_fp = open_img_file(img_path, false, &img_size); diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c new file mode 100644 index 0000000000..a48dc68f0f --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "common/list.h" + +#include "criu-amdgpu.pb-c.h" + +#include +#include + +#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + + +int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) +{ + char path[PATH_MAX]; + struct stat drm; + int ret = 0; + + snprintf(path, sizeof(path), AMDGPU_DRM_DEVICE, DRM_FIRST_RENDER_NODE); + ret = stat(path, &drm); + if (ret == -1) { + pr_err("Error in getting stat for: %s", path); + return ret; + } + + if ((major(st->st_rdev) != major(drm.st_rdev)) || + (minor(st->st_rdev) < minor(drm.st_rdev)) || + (minor(st->st_rdev) > DRM_LAST_RENDER_NODE)) { + pr_err("Can't handle VMA mapping of input device\n"); + return -ENOTSUP; + } + + pr_debug("AMD DRI(maj,min) = %d:%d VMA Device FD(maj,min) = %d:%d\n", + major(drm.st_rdev), minor(drm.st_rdev), + major(st->st_rdev), minor(st->st_rdev)); + + return 0; +} 
+ + diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h new file mode 100644 index 0000000000..37009c8ba7 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -0,0 +1,22 @@ +#ifndef __AMDGPU_PLUGIN_DRM_H__ +#define __AMDGPU_PLUGIN_DRM_H__ + +#include +#include "common/list.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + + +/** + * Determines if VMA's of input file descriptor belong to amdgpu's + * DRM device and are therefore supported + */ +int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); + + +#endif /* __AMDGPU_PLUGIN_DRM_H__ */ + diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index ef79e5ef42..c5fa51fdab 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -16,35 +16,11 @@ #include "xmalloc.h" #include "kfd_ioctl.h" +#include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" #define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/" -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif - -#ifdef COMPILE_TESTS -#undef pr_err -#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg) -#undef pr_info -#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg) -#undef pr_debug -#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg) - -#undef pr_perror -#define pr_perror(format, arg...) \ - fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) -#endif - -#ifdef DEBUG -#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) -#else -#define plugin_log_msg(fmt, ...) 
\ - { \ - } -#endif - /* User override options */ /* Skip firmware version check */ bool kfd_fw_version_check = true; @@ -840,6 +816,9 @@ void topology_free(struct tp_system *sys) list_del(&p2pgroup->listm_system); xfree(p2pgroup); } + + /* Update Topology as being freed */ + sys->parsed = false; } /** @@ -1461,3 +1440,15 @@ int set_restore_gpu_maps(struct tp_system *src_sys, struct tp_system *dest_sys, return ret; } + +int topology_gpu_count(struct tp_system *sys) +{ + struct tp_node *node; + int count = 0; + + list_for_each_entry(node, &sys->nodes, listm_system) + if (NODE_IS_GPU(node)) + count++; + return count; +} + diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index 9d99cda1c2..c890e3ddae 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -107,6 +107,8 @@ int topology_parse(struct tp_system *topology, const char *msg); int topology_determine_iolinks(struct tp_system *sys); void topology_print(const struct tp_system *sys, const char *msg); +int topology_gpu_count(struct tp_system *topology); + struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id); struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id); diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c new file mode 100755 index 0000000000..48ff705556 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -0,0 +1,208 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "common/list.h" + +#include +#include + +#include "criu-plugin.h" +#include "plugin.h" +#include "criu-amdgpu.pb-c.h" + +#include "img-streamer.h" +#include "image.h" +#include "cr_options.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_drm.h" 
+#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + +/* Tracks number of device files that need to be checkpointed */ +static int dev_file_cnt = 0; + +/* Helper structures to encode device topology of SRC and DEST platforms */ +struct tp_system src_topology; +struct tp_system dest_topology; + +/* Helper structures to encode device maps during Checkpoint and Restore operations */ +struct device_maps checkpoint_maps; +struct device_maps restore_maps; + +bool checkpoint_is_complete() +{ + return (dev_file_cnt == 0); +} + +void decrement_checkpoint_count() +{ + dev_file_cnt--; +} + +void init_gpu_count(struct tp_system *topo) +{ + if (dev_file_cnt != 0) + return; + + /* We add ONE to include checkpointing of KFD device */ + dev_file_cnt = 1 + topology_gpu_count(topo); +} + +int read_fp(FILE *fp, void *buf, const size_t buf_len) +{ + size_t len_read; + + len_read = fread(buf, 1, buf_len, fp); + if (len_read != buf_len) { + pr_err("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len); + return -EIO; + } + return 0; +} + +int write_fp(FILE *fp, const void *buf, const size_t buf_len) +{ + size_t len_write; + + len_write = fwrite(buf, 1, buf_len, fp); + if (len_write != buf_len) { + pr_err("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); + return -EIO; + } + return 0; +} + +/** + * @brief Open an image file + * + * We store the size of the actual contents in the first 8-bytes of + * the file. This allows us to determine the file size when using + * criu_image_streamer when fseek and fstat are not available. The + * FILE * returned is already at the location of the first actual + * contents. + * + * @param path The file path + * @param write False for read, true for write + * @param size Size of actual contents + * @return FILE *if successful, NULL if failed + */ +FILE *open_img_file(char *path, bool write, size_t *size) +{ + FILE *fp = NULL; + int fd, ret; + + if (opts.stream) + fd = img_streamer_open(path, write ? 
O_DUMP : O_RSTR); + else + fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); + + if (fd < 0) { + pr_err("%s: Failed to open for %s", path, write ? "write" : "read"); + return NULL; + } + + fp = fdopen(fd, write ? "w" : "r"); + if (!fp) { + pr_err("%s: Failed get pointer for %s", path, write ? "write" : "read"); + return NULL; + } + + if (write) + ret = write_fp(fp, size, sizeof(*size)); + else + ret = read_fp(fp, size, sizeof(*size)); + + if (ret) { + pr_err("%s:Failed to access file size", path); + fclose(fp); + return NULL; + } + + pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size); + return fp; +} + +int read_file(const char *file_path, void *buf, const size_t buf_len) +{ + int ret; + FILE *fp; + + fp = fopen(file_path, "r"); + if (!fp) { + pr_err("Cannot fopen %s", file_path); + return -errno; + } + + ret = read_fp(fp, buf, buf_len); + fclose(fp); /* this will also close fd */ + return ret; +} + + +/** + * @brief Write an image file + * + * We store the size of the actual contents in the first 8-bytes of the file. This allows us to + * determine the file size when using criu_image_streamer when fseek and fstat are not available. + * + * @param path The file path + * @param buf pointer to data to be written + * @param buf_len size of buf + * @return 0 if successful. -errno on failure + */ +int write_img_file(char *path, const void *buf, const size_t buf_len) +{ + int ret; + FILE *fp; + size_t len = buf_len; + + fp = open_img_file(path, true, &len); + if (!fp) + return -errno; + + ret = write_fp(fp, buf, buf_len); + fclose(fp); /* this will also close fd */ + return ret; +} + +void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list) +{ + struct kfd_criu_bo_bucket *bo; + + pr_info("\n"); + for (int idx = 0; idx < bo_cnt; idx++) { + bo = &bo_list[idx]; + pr_info("\n"); + pr_info("%s(), %d. KFD BO Addr: %llx \n", __func__, idx, bo->addr); + pr_info("%s(), %d. 
KFD BO Size: %llx \n", __func__, idx, bo->size); + pr_info("%s(), %d. KFD BO Offset: %llx \n", __func__, idx, bo->offset); + pr_info("%s(), %d. KFD BO Restored Offset: %llx \n", __func__, idx, bo->restored_offset); + pr_info("%s(), %d. KFD BO Alloc Flags: %x \n", __func__, idx, bo->alloc_flags); + pr_info("%s(), %d. KFD BO Gpu ID: %x \n", __func__, idx, bo->gpu_id); + pr_info("%s(), %d. KFD BO Dmabuf FD: %x \n", __func__, idx, bo->dmabuf_fd); + pr_info("\n"); + } + pr_info("\n"); +} + + diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h new file mode 100755 index 0000000000..aacca3a28c --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -0,0 +1,106 @@ +#ifndef __AMDGPU_PLUGIN_UTIL_H__ +#define __AMDGPU_PLUGIN_UTIL_H__ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif + +#ifdef COMPILE_TESTS +#undef pr_err +#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg) +#undef pr_info +#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg) +#undef pr_debug +#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg) + +#undef pr_perror +#define pr_perror(format, arg...) \ + fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) +#endif + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "amdgpu_plugin: " + +#ifdef DEBUG +#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) +#else +#define plugin_log_msg(fmt, ...) 
\ + { \ + } +#endif + + +/* Path where KFD device is surfaced */ +#define AMDGPU_KFD_DEVICE "/dev/kfd" + +/* Path where DRM devices are surfaced */ +#define AMDGPU_DRM_DEVICE "/dev/dri/renderD%d" + +/* Minimum version of KFD IOCTL's that supports C&R */ +#define KFD_IOCTL_MAJOR_VERSION 1 +#define MIN_KFD_IOCTL_MINOR_VERSION 8 + +/* Name of file having serialized data of KFD device */ +#define IMG_KFD_FILE "amdgpu-kfd-%d.img" + +/* Name of file having serialized data of KFD buffer objects (BOs) */ +#define IMG_KFD_PAGES_FILE "amdgpu-pages-%d-%04x.img" + +/* Name of file having serialized data of DRM device */ +#define IMG_DRM_FILE "amdgpu-renderD-%d.img" + +/* Name of file having serialized data of DRM device buffer objects (BOs) */ +#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img" + +/* Helper macros to Checkpoint and Restore a ROCm file */ +#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" +#define HSAKMT_SHM "/hsakmt_shared_mem" +#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" +#define HSAKMT_SEM "hsakmt_semaphore" + +/* Help macros to build sDMA command packets */ +#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) + +#define SDMA_OPCODE_COPY 1 +#define SDMA_COPY_SUB_OPCODE_LINEAR 0 +#define SDMA_NOP 0 +#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21) + +enum sdma_op_type { + SDMA_OP_VRAM_READ, + SDMA_OP_VRAM_WRITE, +}; + +/* Helper structures to encode device topology of SRC and DEST platforms */ +extern struct tp_system src_topology; +extern struct tp_system dest_topology; + +/* Helper structures to encode device maps during Checkpoint and Restore operations */ +extern struct device_maps checkpoint_maps; +extern struct device_maps restore_maps; + +extern int fd_next; + +extern bool kfd_fw_version_check; +extern bool kfd_sdma_fw_version_check; +extern bool kfd_caches_count_check; +extern bool kfd_num_gws_check; +extern bool kfd_vram_size_check; +extern bool kfd_numa_check; +extern bool 
kfd_capability_check; + +int read_fp(FILE *fp, void *buf, const size_t buf_len); +int write_fp(FILE *fp, const void *buf, const size_t buf_len); +int read_file(const char *file_path, void *buf, const size_t buf_len); +int write_img_file(char *path, const void *buf, const size_t buf_len); +FILE *open_img_file(char *path, bool write, size_t *size); + +bool checkpoint_is_complete(); +void decrement_checkpoint_count(); +void init_gpu_count(struct tp_system *topology); + +void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list); + +#endif /* __AMDGPU_PLUGIN_UTIL_H__ */ diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 81d00d3ff1..078b676500 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -5,7 +5,7 @@ message dev_iolink { required uint32 node_to_id = 2; } -message device_entry { +message kfd_device_entry { required uint32 node_id = 1; required uint32 gpu_id = 2; required uint32 cpu_cores_count = 3; @@ -40,10 +40,10 @@ message device_entry { repeated dev_iolink iolinks = 32; } -message bo_entry { - required uint64 addr = 1; - required uint64 size = 2; - required uint64 offset = 3; +message kfd_bo_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; required uint32 alloc_flags = 4; required uint32 gpu_id = 5; } @@ -52,10 +52,10 @@ message criu_kfd { required uint32 pid = 1; required uint32 num_of_gpus = 2; required uint32 num_of_cpus = 3; - repeated device_entry device_entries = 4; - required uint64 num_of_bos = 5; - repeated bo_entry bo_entries = 6; - required uint32 num_of_objects = 7; + repeated kfd_device_entry device_entries = 4; + required uint64 num_of_bos = 5; + repeated kfd_bo_entry bo_entries = 6; + required uint32 num_of_objects = 7; required uint64 shared_mem_size = 8; required uint32 shared_mem_magic = 9; required bytes priv_data = 10; From 9d9ae2954d3f1758ea659ed8e614dd28e8a368dd Mon Sep 17 00:00:00 2001 From: Ramesh Errabolu 
Date: Fri, 10 Nov 2023 13:02:49 -0600 Subject: [PATCH 204/321] amdgpu_plugin: Refactor code used to implement Checkpoint Refactor code used to Checkpoint DRM devices. Code is moved into amdgpu_plugin_drm.c file which hosts various methods to checkpoint and restore a workload. Signed-off-by: Ramesh Errabolu --- plugins/amdgpu/amdgpu_plugin.c | 62 ++++++++++++++---------------- plugins/amdgpu/amdgpu_plugin_drm.c | 38 ++++++++++++++++++ plugins/amdgpu/amdgpu_plugin_drm.h | 6 +++ 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 60e04f9735..a579158d0d 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -49,6 +49,13 @@ struct vma_metadata { /************************************ Global Variables ********************************************/ +/** + * FD of KFD device used to checkpoint. On a multi-process + * tree the order of checkpointing goes from parent to child + * and so on - so saving the FD will not be overwritten + */ +static int kfd_checkpoint_fd; + static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; @@ -990,6 +997,10 @@ static int unpause_process(int fd) goto exit; } + // Reset the KFD FD + kfd_checkpoint_fd = -1; + sys_close_drm_render_devices(&src_topology); + exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret); @@ -1181,44 +1192,25 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } + /* Initialize number of device files that will be checkpointed */ + init_gpu_count(&src_topology); + /* Check whether this plugin was called for kfd or render nodes */ if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { + /* This is RenderD dumper plugin, for now just save renderD * minor number to be used during restore. In later phases this * needs to save more data for video decode etc. 
*/ - - CriuRenderNode rd = CRIU_RENDER_NODE__INIT; - struct tp_node *tp_node; - - pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id); - - tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev)); - if (!tp_node) { - pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev)); - - return -ENODEV; - } - - rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); - if (!rd.gpu_id) - return -ENODEV; - - len = criu_render_node__get_packed_size(&rd); - buf = xmalloc(len); - if (!buf) - return -ENOMEM; - - criu_render_node__pack(&rd, buf); - - snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); - ret = write_img_file(img_path, buf, len); - if (ret) { - xfree(buf); + ret = amdgpu_plugin_drm_dump_file(fd, id, &st); + if (ret) return ret; - } - xfree(buf); + /* Invoke unpause process if needed */ + decrement_checkpoint_count(); + if (checkpoint_is_complete()) { + ret = unpause_process(kfd_checkpoint_fd); + } /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; @@ -1315,11 +1307,15 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = write_img_file(img_path, buf, len); xfree(buf); + exit: - /* Restore all queues */ - unpause_process(fd); + /* Restore all queues if conditions permit */ + kfd_checkpoint_fd = fd; + decrement_checkpoint_count(); + if (checkpoint_is_complete()) { + ret = unpause_process(fd); + } - sys_close_drm_render_devices(&src_topology); xfree((void *)args.devices); xfree((void *)args.bos); xfree((void *)args.priv_data); diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index a48dc68f0f..689d620720 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -61,3 +61,41 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) } +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) +{ + CriuRenderNode rd = CRIU_RENDER_NODE__INIT; + 
struct tp_node *tp_node; + char path[PATH_MAX]; + unsigned char *buf; + int minor; + int len; + int ret; + + /* Get the topology node of the DRM device */ + minor = minor(drm->st_rdev); + tp_node = sys_get_node_by_render_minor(&src_topology, minor); + if (!tp_node) { + pr_err("Failed to find a device with minor number = %d\n", minor); + return -ENODEV; + } + + /* Get the GPU_ID of the DRM device */ + rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); + if (!rd.gpu_id) { + pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id); + return -ENODEV; + } + + len = criu_render_node__get_packed_size(&rd); + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + + criu_render_node__pack(&rd, buf); + + snprintf(path, sizeof(path), IMG_DRM_FILE, id); + ret = write_img_file(path, buf, len); + xfree(buf); + return ret; +} + diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h index 37009c8ba7..6f0c1a9a63 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.h +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -17,6 +17,12 @@ */ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); +/** + * Serialize meta-data about a particular DRM device, its number of BOs, + * etc into a file. 
The serialized filename has in it the value ID that + * is passed in as a parameter + */ +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm); #endif /* __AMDGPU_PLUGIN_DRM_H__ */ From 66cab1f49e54791275fcc4d592fedf5825bd94d4 Mon Sep 17 00:00:00 2001 From: rahulk789 Date: Sun, 26 Nov 2023 14:31:42 +0530 Subject: [PATCH 205/321] sk-inet: Added IP_TTL socket option Signed-off-by: rahulk789 --- criu/sk-inet.c | 7 ++++++- images/sk-inet.proto | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 24e92a8521..b8154e8602 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -417,10 +417,12 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); ret |= dump_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); ret |= dump_opt(sk, SOL_IP, IP_TOS, &ioe->tos); + ret |= dump_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); } ioe->has_freebind = ioe->freebind; ioe->has_pktinfo = !!ioe->pktinfo; ioe->has_tos = !!ioe->tos; + ioe->has_ttl = !!ioe->ttl; return ret; } @@ -817,7 +819,10 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) ret |= restore_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); if (ioe->has_tos) ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); - } + if (ioe->has_ttl) + ret |= restore_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); + + } if (ioe->raw) ret |= restore_ip_raw_opts(sk, family, proto, ioe->raw); diff --git a/images/sk-inet.proto b/images/sk-inet.proto index 666326fa40..03a679e7fa 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -20,6 +20,7 @@ message ip_opts_entry { optional bool pktinfo = 5; optional uint32 tos = 6; + optional uint32 ttl = 7; } message inet_sk_entry { From a49d6db3ca9d3036332a4305dd2ae9d93af6d7ad Mon Sep 17 00:00:00 2001 From: rahulk789 Date: Sun, 26 Nov 2023 14:35:47 +0530 Subject: [PATCH 206/321] zdtm: Added tests for IP_TTL restore Signed-off-by: rahulk789 --- 
test/zdtm/static/sock_ip_opts00.c | 1 + 1 file changed, 1 insertion(+) diff --git a/test/zdtm/static/sock_ip_opts00.c b/test/zdtm/static/sock_ip_opts00.c index d890410d89..cb464365d9 100644 --- a/test/zdtm/static/sock_ip_opts00.c +++ b/test/zdtm/static/sock_ip_opts00.c @@ -26,6 +26,7 @@ struct sk_opt { struct sk_opt sk_opts_v4[] = { { SOL_IP, IP_FREEBIND, IP_OPT_VAL }, { SOL_IP, IP_PKTINFO, IP_OPT_VAL }, + { SOL_IP, IP_TTL, 32 }, { SOL_IP, IP_TOS, IPTOS_TOS(IPTOS_THROUGHPUT) }, }; From 495081c20ba23dc82bc83684730d36b5a02fdce5 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 6 Feb 2024 12:04:07 +0800 Subject: [PATCH 207/321] sk-inet: fix codding style in restore_ip_opts Commit [1] introduced codding-style breackage, let's fix it. Fixes: 66cab1f49 ("sk-inet: Added IP_TTL socket option") [1] Signed-off-by: Pavel Tikhomirov --- criu/sk-inet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index b8154e8602..a6a767c73f 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -821,8 +821,7 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); if (ioe->has_ttl) ret |= restore_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); - - } + } if (ioe->raw) ret |= restore_ip_raw_opts(sk, family, proto, ioe->raw); From 6d37f9a4df8727e5beb7a9f5e0dc5165bcc33cea Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 9 Feb 2024 14:14:10 +0000 Subject: [PATCH 208/321] amdgpu_plugin: fix lint errors $ make lint ... # Do not append \n to pr_perror, pr_pwarn or fail ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' plugins/amdgpu/amdgpu_plugin.c: pr_perror("%s(), Can't handle VMAs of input device\n", __func__); ! 
git --no-pager grep -En '^\s*\.*);$' | grep -v '\\n' plugins/amdgpu/amdgpu_plugin_drm.c:45: pr_err("Error in getting stat for: %s", path); plugins/amdgpu/amdgpu_plugin_util.c:77: pr_err("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len); plugins/amdgpu/amdgpu_plugin_util.c:89: pr_err("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); plugins/amdgpu/amdgpu_plugin_util.c:120: pr_err("%s: Failed to open for %s", path, write ? "write" : "read"); plugins/amdgpu/amdgpu_plugin_util.c:126: pr_err("%s: Failed get pointer for %s", path, write ? "write" : "read"); plugins/amdgpu/amdgpu_plugin_util.c:136: pr_err("%s:Failed to access file size", path); plugins/amdgpu/amdgpu_plugin_util.c:152: pr_err("Cannot fopen %s", file_path); make: *** [Makefile:470: lint] Error 1 Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 2 +- plugins/amdgpu/amdgpu_plugin_drm.c | 3 +-- plugins/amdgpu/amdgpu_plugin_util.c | 16 +++++++--------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index a579158d0d..a41469a509 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -412,7 +412,7 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) /* Determine if input is a DRM device and therefore is supported */ ret = amdgpu_plugin_drm_handle_device_vma(fd, st_buf); if (ret) - pr_perror("%s(), Can't handle VMAs of input device\n", __func__); + pr_perror("%s(), Can't handle VMAs of input device", __func__); return ret; } diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 689d620720..d54cd937d5 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -42,7 +42,7 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) snprintf(path, sizeof(path), AMDGPU_DRM_DEVICE, DRM_FIRST_RENDER_NODE); ret = stat(path, &drm); if (ret == -1) { - 
pr_err("Error in getting stat for: %s", path); + pr_err("Error in getting stat for: %s\n", path); return ret; } @@ -98,4 +98,3 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) xfree(buf); return ret; } - diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index 48ff705556..62e569fc85 100755 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -65,7 +65,7 @@ void init_gpu_count(struct tp_system *topo) return; /* We add ONE to include checkpointing of KFD device */ - dev_file_cnt = 1 + topology_gpu_count(topo); + dev_file_cnt = 1 + topology_gpu_count(topo); } int read_fp(FILE *fp, void *buf, const size_t buf_len) @@ -74,7 +74,7 @@ int read_fp(FILE *fp, void *buf, const size_t buf_len) len_read = fread(buf, 1, buf_len, fp); if (len_read != buf_len) { - pr_err("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len); + pr_err("Unable to read file (read:%ld buf_len:%ld)\n", len_read, buf_len); return -EIO; } return 0; @@ -86,7 +86,7 @@ int write_fp(FILE *fp, const void *buf, const size_t buf_len) len_write = fwrite(buf, 1, buf_len, fp); if (len_write != buf_len) { - pr_err("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); + pr_err("Unable to write file (wrote:%ld buf_len:%ld)\n", len_write, buf_len); return -EIO; } return 0; @@ -117,13 +117,13 @@ FILE *open_img_file(char *path, bool write, size_t *size) fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); if (fd < 0) { - pr_err("%s: Failed to open for %s", path, write ? "write" : "read"); + pr_err("%s: Failed to open for %s\n", path, write ? "write" : "read"); return NULL; } fp = fdopen(fd, write ? "w" : "r"); if (!fp) { - pr_err("%s: Failed get pointer for %s", path, write ? "write" : "read"); + pr_err("%s: Failed get pointer for %s\n", path, write ? 
"write" : "read"); return NULL; } @@ -133,7 +133,7 @@ FILE *open_img_file(char *path, bool write, size_t *size) ret = read_fp(fp, size, sizeof(*size)); if (ret) { - pr_err("%s:Failed to access file size", path); + pr_err("%s:Failed to access file size\n", path); fclose(fp); return NULL; } @@ -149,7 +149,7 @@ int read_file(const char *file_path, void *buf, const size_t buf_len) fp = fopen(file_path, "r"); if (!fp) { - pr_err("Cannot fopen %s", file_path); + pr_err("Cannot fopen %s\n", file_path); return -errno; } @@ -204,5 +204,3 @@ void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list) } pr_info("\n"); } - - From 7bd786d0d450511cfc8685e5dd9a0d21e260bbaa Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 12 Feb 2024 11:52:41 +0000 Subject: [PATCH 209/321] kerndat: check support for PAGE_IS_SOFT_DIRTY The commit introducing PAGE_IS_SOFT_DIRTY has not been merged in kernel v6.7.x. fs/proc/task_mmu: report SOFT_DIRTY bits through the PAGEMAP_SCAN ioctl https://github.com/torvalds/linux/commit/e6a9a2cbc13bf As a result, CRIU fails with the following error: Error (criu/pagemap-cache.c:199): pagemap-cache: PAGEMAP_SCAN: Invalid argument' Error (criu/pagemap-cache.c:225): pagemap-cache: Failed to fill cache for 63 (400000-402000)' This patch updates check_pagemap() in kerndat to check if PAGE_IS_SOFT_DIRTY is supported. 
Fixes: #2334 Signed-off-by: Radostin Stoyanov --- criu/kerndat.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 95e7226b2b..e3b378a9c7 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -63,6 +63,14 @@ static int check_pagemap(void) { int ret, fd, retry; u64 pfn = 0; + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); if (fd < 0) { @@ -75,15 +83,11 @@ static int check_pagemap(void) return -1; } - if (ioctl(fd, PAGEMAP_SCAN, NULL) == 0) { - pr_err("PAGEMAP_SCAN succeeded unexpectedly\n"); - return -1; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { + pr_debug("PAGEMAP_SCAN is supported\n"); + kdat.has_pagemap_scan = true; } else { switch (errno) { - case EFAULT: - pr_debug("PAGEMAP_SCAN is supported\n"); - kdat.has_pagemap_scan = true; - break; case EINVAL: case ENOTTY: pr_debug("PAGEMAP_SCAN isn't supported\n"); From cac03bec2c4ac5d986041a5c2872d78da04eb7a7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 18 Jan 2024 14:26:33 +0000 Subject: [PATCH 210/321] pb2dict: fix flake8 error This patch fixes the following flake8 error: python3 -m flake8 --config=scripts/flake8.cfg lib/pycriu/images/pb2dict.py lib/pycriu/images/pb2dict.py:361:43: E721 do not compare types, for exact checks use `is` / `is not`, for instance checks use `isinstance()` Signed-off-by: Radostin Stoyanov --- lib/pycriu/images/pb2dict.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 3f5f390e39..d29fdf66ce 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -358,7 +358,10 @@ 
def pb2dict(pb, pretty=False, is_hex=False): else: d_val = _pb2dict_cast(field, value, pretty, is_hex) - d[field.name] = d_val.decode() if type(d_val) == bytes else d_val + try: + d[field.name] = d_val.decode() + except (UnicodeDecodeError, AttributeError): + d[field.name] = d_val return d From 8a22b15428f6c47a98e9dabeb2fcc7fb65910a86 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 9 Feb 2024 13:04:58 +0000 Subject: [PATCH 211/321] make: replace flake8 with ruff Ruff (https://github.com/astral-sh/ruff) is a Python linter written in Rust, designed to replace Flake8. It is significantly faster and actively maintained. In addition to replacing flake8 with ruff, this patch also creates separate makefile targets for ruff, shellcheck and codespell, so that they can be tested independently. RUFF_FLAGS can be used to specify options such as '--fix'. Example: make lint make ruff RUFF_FLAGS=--fix Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 4 +-- .github/workflows/lint.yml | 2 +- CONTRIBUTING.md | 2 +- Makefile | 37 ++++++++++++++---------- scripts/build/Dockerfile.alpine | 1 - scripts/build/Dockerfile.archlinux | 1 - scripts/build/Dockerfile.centos8 | 1 - scripts/ci/prepare-for-fedora-rawhide.sh | 1 - scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 4 +-- scripts/ruff.toml | 4 +++ 11 files changed, 33 insertions(+), 26 deletions(-) create mode 100644 scripts/ruff.toml diff --git a/.cirrus.yml b/.cirrus.yml index adaa9be334..72135590d9 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml 
python3-importlib-metadata python-flake8 xmlto libdrm-devel + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. dnf -y upgrade nettle nettle-devel @@ -111,7 +111,7 @@ task: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : yum install -y dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index f52bce8123..4892594744 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,7 +14,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format ShellCheck + run: sudo dnf -y install git make ruff xz clang-tools-extra which codespell git-clang-format ShellCheck - uses: actions/checkout@v2 diff --git 
a/CONTRIBUTING.md b/CONTRIBUTING.md index a70506bfbf..37965e5fba 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -59,7 +59,7 @@ Other conventions can be learned from the source code itself. In short, make sur Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. -The following command can be used to automatically run a code linter for Python files (flake8), Shell scripts (shellcheck), +The following command can be used to automatically run a code linter for Python files (ruff), Shell scripts (shellcheck), text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). ``` diff --git a/Makefile b/Makefile index ff0ca92db6..6a17a30b5a 100644 --- a/Makefile +++ b/Makefile @@ -436,20 +436,23 @@ help: @echo ' amdgpu_plugin - Make AMD GPU plugin' .PHONY: help -lint: - flake8 --version - flake8 --config=scripts/flake8.cfg test/zdtm.py - flake8 --config=scripts/flake8.cfg test/inhfd/*.py - flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py - flake8 --config=scripts/flake8.cfg lib/pycriu/images/pb2dict.py - flake8 --config=scripts/flake8.cfg lib/pycriu/images/images.py - flake8 --config=scripts/flake8.cfg scripts/criu-ns - flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py - flake8 --config=scripts/flake8.cfg crit/*.py - flake8 --config=scripts/flake8.cfg crit/crit/*.py - flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py - flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump - flake8 --config=scripts/flake8.cfg scripts/github-indent-warnings.py +ruff: + @ruff --version + ruff ${RUFF_FLAGS} --config=scripts/ruff.toml \ + test/zdtm.py \ + test/inhfd/*.py \ + test/others/rpc/config_file.py \ + lib/pycriu/images/pb2dict.py \ + lib/pycriu/images/images.py \ + scripts/criu-ns \ + test/others/criu-ns/run.py \ + crit/*.py \ + 
crit/crit/*.py \ + scripts/uninstall_module.py \ + coredump/ coredump/coredump \ + scripts/github-indent-warnings.py + +shellcheck: shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install @@ -458,7 +461,11 @@ lint: shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh shellcheck -x test/others/action-script/*.sh + +codespell: codespell -S tags + +lint: ruff shellcheck codespell # Do not append \n to pr_perror, pr_pwarn or fail ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' # Do not use %m with pr_* or fail @@ -469,7 +476,7 @@ lint: ! git --no-pager grep -En '^\s*\.*);$$' | grep -v '\\n' # No EOL whitespace for C files ! git --no-pager grep -E '\s+$$' \*.c \*.h -.PHONY: lint +.PHONY: lint ruff shellcheck codespell codecov: SHELL := $(shell which bash) codecov: diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 2c58c910e7..329d7791de 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -41,7 +41,6 @@ RUN apk add \ go \ e2fsprogs \ py-yaml \ - py3-flake8 \ py3-importlib-metadata \ asciidoctor diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index b9968e876b..4056514891 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -31,7 +31,6 @@ RUN pacman -Syu --noconfirm \ bash \ go \ python-yaml \ - flake8 \ asciidoctor \ python-junit-xml \ python-importlib-metadata \ diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index b065246744..a672123441 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -26,7 +26,6 @@ RUN yum install -y --allowerasing \ protobuf-c-devel \ protobuf-devel \ python3-devel \ - python3-flake8 \ python3-PyYAML \ python3-protobuf \ python3-pip \ diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh 
b/scripts/ci/prepare-for-fedora-rawhide.sh index d812c5faa5..09085c403b 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -23,7 +23,6 @@ dnf install -y \ procps-ng \ protobuf-c-devel \ protobuf-devel \ - python3-flake8 \ python3-PyYAML \ python3-protobuf \ python3-junit_xml \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index ef7e869e03..2fdecbc973 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -4,7 +4,7 @@ set -x -e CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time flake8 libbsd-dev python3-yaml + libnl-route-3-dev time libbsd-dev python3-yaml libperl-dev pkg-config python3-protobuf python3-pip python3-importlib-metadata python3-junit.xml libdrm-dev) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c8cf0be744..4c1be35443 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -38,8 +38,8 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-flake8 python3-protobuf python3-importlib-metadata \ - python3-junit_xml rubygem-asciidoctor iptables libselinux-devel libbpf-devel + protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ + rubygem-asciidoctor iptables libselinux-devel libbpf-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default cat /proc/cmdline diff --git a/scripts/ruff.toml b/scripts/ruff.toml new file mode 100644 index 0000000000..2b0385976e --- /dev/null +++ b/scripts/ruff.toml @@ -0,0 +1,4 @@ +# Ignore `F401` (unused imports) in all `__init__.py` files +[lint.per-file-ignores] 
+"__init__.py" = ["F401"] + From da7e6d3e1ef86f7bfeec65b6bd0a1172be81a073 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 9 Feb 2024 13:47:55 +0000 Subject: [PATCH 212/321] criu-ns: fix lint error This patch fixes the following lint error: scripts/criu-ns:219:16: E713 [*] Test for membership should be `not in` The change in this patch is auto-generated with `ruff --fix`. Signed-off-by: Radostin Stoyanov --- scripts/criu-ns | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index 4c032aa140..5950d7c50e 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -216,7 +216,7 @@ def wrap_restore(): def get_varg(args): for i in range(1, len(sys.argv)): - if not sys.argv[i] in args: + if sys.argv[i] not in args: continue if i + 1 >= len(sys.argv): From 1120308733b7547bc9e3f6e2809ecde2d47515fd Mon Sep 17 00:00:00 2001 From: Stepan Pieshkin Date: Fri, 2 Feb 2024 07:03:05 +0000 Subject: [PATCH 213/321] cgroup: Add support for restoring a thread in a correct v1 cgroup Currently we have checkpoint/restore support only of cgroup v2 threaded controllers. Threads originating in cgroup v1 environments will be restored to the main thread's cgroup. This change extends the support for a cgroups v1. Signed-off-by: Stepan Pieshkin --- criu/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 67282f269e..6d1f74457d 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -427,10 +427,11 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const } /* - * Set the is_threaded flag if cgroup.type's value is threaded, - * ignore all other values. + * Set the is_threaded flag if cgroup.type's value is threaded + * or it is a cgroup v1 (it has a 'tasks' property). + * Ignore all other values. 
*/ - if (!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) + if ((!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) || !strcmp("tasks", prop->name)) controller->is_threaded = true; pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); @@ -1922,7 +1923,7 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) if (ctrl->cnames[0][0] == 0) fstype = "cgroup2"; - pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); + pr_debug("\tMaking controller dir %s (%s), type %s\n", paux, opt, fstype); if (mkdir(paux, 0700)) { pr_perror("\tCan't make controller dir %s", paux); return -1; @@ -1985,6 +1986,7 @@ static int cgroupd(int sk) CgMemberEntry *ce = cg_set_entry->ctls[i]; char aux[PATH_MAX]; CgControllerEntry *ctrl = NULL; + const char *format; for (j = 0; j < n_controllers; j++) { CgControllerEntry *cur = controllers[j]; @@ -2008,7 +2010,8 @@ static int cgroupd(int sk) continue; aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); - snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path); + format = ctrl->cnames[0][0] ? 
"/%s/tasks" : "/%s/cgroup.threads"; + snprintf(aux + aux_off, sizeof(aux) - aux_off, format, ce->path); /* * Cgroupd runs outside of the namespaces so we don't From d553fad2dcbaf3aff0c41c4a7de61b4cb203784c Mon Sep 17 00:00:00 2001 From: Stepan Pieshkin Date: Fri, 2 Feb 2024 07:05:51 +0000 Subject: [PATCH 214/321] zdtm/static: check that cgroup layout of threads is preserved Co-developed-by: Stepan Pieshkin Signed-off-by: Stepan Pieshkin Signed-off-by: Michal Clapinski Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 3 + test/zdtm/static/cgroup_threads.c | 184 +++++++++++++++++++++++++++ test/zdtm/static/cgroup_threads.desc | 1 + test/zdtm/static/cgroup_threads.hook | 19 +++ 4 files changed, 207 insertions(+) create mode 100644 test/zdtm/static/cgroup_threads.c create mode 100644 test/zdtm/static/cgroup_threads.desc create mode 100755 test/zdtm/static/cgroup_threads.hook diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index fb856d55b4..548cefac28 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -402,6 +402,7 @@ TST_DIR = \ cgroup_ignore \ cgroup_stray \ cgroup_yard \ + cgroup_threads \ unlink_fstat04 \ unlink_fstat041 \ mntns_remap \ @@ -684,6 +685,8 @@ s390x_gs_threads: LDFLAGS += -pthread thread_different_uid_gid: LDLIBS += -pthread -lcap +cgroup_threads: LDFLAGS += -pthread + bpf_hash: LDLIBS += -lbpf bpf_array: LDLIBS += -lbpf diff --git a/test/zdtm/static/cgroup_threads.c b/test/zdtm/static/cgroup_threads.c new file mode 100644 index 0000000000..2c17e13a77 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that cgroup layout of threads is preserved"; +const char *test_author = "MichaÅ‚ CÅ‚apiÅ„ski "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = "zdtmtst"; +#define SUBNAME 
"subcg_threads" +#define SUBNAME2 SUBNAME "/subsubcg" + +#define exit_group(code) syscall(__NR_exit_group, code) + +static int cg_move(char *name) +{ + int cgfd, l; + char paux[256]; + + sprintf(paux, "%s/%s", dirname, name); + if (mkdir(paux, 0600)) { + pr_perror("Can't create %s", paux); + return -1; + } + + sprintf(paux, "%s/%s/tasks", dirname, name); + + cgfd = open(paux, O_WRONLY); + if (cgfd < 0) { + pr_perror("Can't open tasks"); + return -1; + } + + l = write(cgfd, "0", 2); + close(cgfd); + + if (l < 0) { + pr_perror("Can't move self to subcg"); + return -1; + } + + return 0; +} + +static int cg_check(char *name) +{ + int found = 0; + FILE *cgf; + char paux[256], aux[128]; + + cgf = fopen("/proc/thread-self/cgroup", "r"); + if (cgf == NULL) + return -1; + + sprintf(aux, "name=%s:/%s", cgname, name); + while (fgets(paux, sizeof(paux), cgf)) { + char *s; + + s = strchr(paux, ':') + 1; + s[strlen(s) - 1] = '\0'; + test_msg("CMP [%s] vs [%s]\n", s, aux); + if (!strcmp(s, aux)) { + found = 1; + break; + } + } + + fclose(cgf); + + return found ? 0 : -1; +} + +int th_sync[2], rst_sync[2]; + +void *thread_fn(void *args) +{ + int status = cg_move(SUBNAME2); + + if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { + pr_perror("write"); + exit_group(1); + } + + if (status == 0) { + if (read(rst_sync[0], &status, sizeof(status)) < 0) { + pr_perror("read"); + exit_group(1); + } + + status = cg_check(SUBNAME2); + if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { + pr_perror("write"); + exit_group(1); + } + } + + pthread_exit(0); +} + +int main(int argc, char **argv) +{ + int status, exit_code = 1; + pthread_t thread; + char aux[64]; + + test_init(argc, argv); + + /* + * Pipe to talk to the kid. + * First, it reports that it's ready (int), + * then it reports the restore status (int). 
+ */ + + if (pipe(th_sync)) { + pr_perror("pipe"); + return 1; + } + + /* "Restore happened" pipe */ + if (pipe(rst_sync)) { + pr_perror("pipe"); + return 1; + } + + if (mkdir(dirname, 0700) < 0) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(aux, "none,name=%s", cgname); + if (mount("none", dirname, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto out_rd; + } + + if (cg_move(SUBNAME)) + goto out_rs; + + if (pthread_create(&thread, NULL, thread_fn, NULL)) { + pr_perror("Can't create a new thread"); + goto out_rs; + } + + status = -1; + read(th_sync[0], &status, sizeof(status)); + if (status != 0) { + pr_perror("Error moving into cgroups"); + close(rst_sync[0]); + goto out_rs; + } + + test_daemon(); + test_waitsig(); + + close(rst_sync[1]); + + status = -1; + if (read(th_sync[0], &status, sizeof(status)) < 0) { + pr_perror("read"); + goto out_rs; + } + if (status != 0) { + fail("child cg changed"); + goto out_rs; + } + + pass(); + exit_code = 0; + +out_rs: + umount(dirname); +out_rd: + rmdir(dirname); +out: + return exit_code; +} diff --git a/test/zdtm/static/cgroup_threads.desc b/test/zdtm/static/cgroup_threads.desc new file mode 100644 index 0000000000..3c6c4a7e22 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup_threads.hook b/test/zdtm/static/cgroup_threads.hook new file mode 100755 index 0000000000..f4b553d347 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.hook @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +tname=$(mktemp -d cgclean.XXXXXX) +trap 'rmdir "${tname}"' EXIT + +mount -t cgroup none $tname -o "none,name=zdtmtst" +trap 'umount "${tname}"; rmdir "${tname}"' EXIT + +echo "Cleaning $tname" + +rmdir "$tname/subcg_threads/subsubcg/" || true +rmdir "$tname/subcg_threads/" || true + +echo "Left there is:" +ls "$tname" From 
c98fefdf5def6a066f8788e33194a60eb14f7936 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 25 May 2022 20:04:59 +0300 Subject: [PATCH 215/321] compiler: add ALIGN_DOWN macro Signed-off-by: Mike Rapoport (IBM) --- include/common/compiler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/common/compiler.h b/include/common/compiler.h index 1347b62362..3e66709f92 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -89,6 +89,7 @@ #define round_down(x, y) ((x) & ~__round_mask(x, y)) #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) #define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) +#define ALIGN_DOWN(x, a) ALIGN((x) - ((a) - 1), (a)) #define min(x, y) \ ({ \ From 0dba58ae2fb114b5ac1a136e2142ea2d8a535506 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 15 May 2022 10:04:16 +0300 Subject: [PATCH 216/321] compel: always pass user_fpregs_struct_t to compel_get_task_regs() All architectures create on-stack structure for floating point save area in compel_get_task_regs() if the caller passes NULL rather than a valid pointer. The only place that calls compel_get_task_regs() with NULL for floating point save area is parasite_start_daemon() and it is simpler to define this structure on stack of parasite_start_daemon(). The availability of floating point save data is required in parasite_start_daemon() to detect shadow stack presence early during parasite infection and will be used in later patches. 
Signed-off-by: Mike Rapoport (IBM) --- compel/arch/aarch64/src/lib/infect.c | 3 +-- compel/arch/arm/src/lib/infect.c | 3 +-- compel/arch/mips/src/lib/infect.c | 3 +-- compel/arch/ppc64/src/lib/infect.c | 3 +-- compel/arch/s390/src/lib/infect.c | 3 +-- compel/arch/x86/src/lib/infect.c | 3 +-- compel/src/lib/infect.c | 3 ++- 7 files changed, 8 insertions(+), 13 deletions(-) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index d0189f0039..812ba34a37 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -59,10 +59,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpsimd = ext_regs ? ext_regs : &tmp; struct iovec iov; int ret; diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index 7700f52caf..8b810a88f5 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -65,10 +65,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr } #define PTRACE_GETVFPREGS 27 -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *vfp, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *vfp = ext_regs ? 
ext_regs : &tmp; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c index afa0f5ed5f..0e98aaee3f 100644 --- a/compel/arch/mips/src/lib/infect.c +++ b/compel/arch/mips/src/lib/infect.c @@ -119,10 +119,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 1603ac92e5..84c2b1d7c3 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -391,10 +391,9 @@ static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_stru return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; int ret; ret = __get_task_regs(pid, regs, fpregs); diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 3cd25e71d8..85dfc3a4d4 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -293,10 +293,9 @@ static int s390_disable_ri_bit(pid_t pid, user_regs_struct_t *regs) /* * Prepare task registers for restart */ -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpregs = ext_regs ? ext_regs : &tmp; struct iovec iov; int rewind; diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 88bdb4047e..2febbf3f72 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -345,10 +345,9 @@ static int corrupt_extregs(pid_t pid) return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, void *arg, unsigned long flags) { - user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping general registers for %d in %s mode\n", pid, user_regs_native(regs) ? "native" : "compat"); diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index b9a913fa1e..696daa7f1b 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -739,6 +739,7 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; struct infect_ctx *ictx = &ctl->ictx; + user_fpregs_struct_t ext_regs; /* * Get task registers before going daemon, since the @@ -746,7 +747,7 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) * while in daemon it is not such. 
*/ - if (compel_get_task_regs(pid, &ctl->orig.regs, NULL, ictx->save_regs, ictx->regs_arg, ictx->flags)) { + if (compel_get_task_regs(pid, &ctl->orig.regs, &ext_regs, ictx->save_regs, ictx->regs_arg, ictx->flags)) { pr_err("Can't obtain regs for thread %d\n", pid); return -1; } From fc683cb01cf64aa43b1795f9f648091fea3ec2f1 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Mon, 21 Mar 2022 09:34:41 +0200 Subject: [PATCH 217/321] compel: shstk: save CET state when CPU supports it Signed-off-by: Mike Rapoport (IBM) --- .../arch/x86/src/lib/include/uapi/asm/cpu.h | 1 + .../arch/x86/src/lib/include/uapi/asm/fpu.h | 11 +++- .../src/lib/include/uapi/asm/infect-types.h | 3 + compel/arch/x86/src/lib/infect.c | 65 ++++++++++++++++++- compel/include/uapi/infect.h | 8 +++ criu/arch/x86/crtools.c | 17 +++++ images/core-x86.proto | 8 +++ 7 files changed, 111 insertions(+), 2 deletions(-) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h index 63ff83dbeb..11c50e0e56 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h @@ -244,6 +244,7 @@ enum cpuid_leafs { #define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (11 * 32 + 7) /* Shadow Stack */ #define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (11 * 32 + 9) /* Vector AES */ #define X86_FEATURE_VPCLMULQDQ (11 * 32 + 10) /* Carry-Less Multiplication Double Quadword */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index 8c83dd9ae4..d595a68fce 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -245,6 +245,14 @@ 
struct pkru_state { uint32_t pad; } __packed; +/* + * State component 11 is Control-flow Enforcement user states + */ +struct cet_user_state { + uint64_t cet; /* user control-flow settings */ + uint64_t ssp; /* user shadow stack pointer */ +}; + /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. @@ -260,7 +268,7 @@ struct pkru_state { * Of course it was not ;-) Now using four pages... * */ -#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) +#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct) - sizeof(struct cet_user_state)) /* * cpu requires it to be 64 byte aligned @@ -276,6 +284,7 @@ struct xsave_struct { struct ymmh_struct ymmh; uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; + struct cet_user_state cet; } __aligned(FP_MIN_ALIGN_BYTES) __packed; struct xsave_struct_ia32 { diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h index b35504ff88..2619fe64a5 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -143,4 +143,7 @@ typedef struct xsave_struct user_fpregs_struct_t; */ #define __NR32_mmap __NR32_mmap2 +extern bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs); +#define compel_shstk_enabled __compel_shstk_enabled + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 2febbf3f72..aabb4f3715 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -26,6 +26,16 @@ #ifndef NT_X86_XSTATE #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ #endif + +#ifndef NT_X86_SHSTK +#define NT_X86_SHSTK 0x204 /* x86 shstk state */ +#endif + +#ifndef ARCH_SHSTK_STATUS +#define ARCH_SHSTK_STATUS 0x5005 
+#define ARCH_SHSTK_SHSTK (1ULL << 0) +#endif + #ifndef NT_PRSTATUS #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ #endif @@ -250,7 +260,49 @@ static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) // [1] Intel® 64 and IA-32 Architectures Software Developer's // Manual Volume 1: Basic Architecture // Section 13.6: Processor tracking of XSAVE-managed state - return get_task_fpregs(pid, xsave); + if (get_task_fpregs(pid, xsave)) + return -1; + } + + /* + * xsave may be on stack, if we don't clear it explicitly we get + * funky shadow stack state + */ + memset(&xsave->cet, 0, sizeof(xsave->cet)); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + unsigned long ssp = 0; + unsigned long features = 0; + + if (ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long)&features, ARCH_SHSTK_STATUS)) { + /* + * kernels that don't support shadow stack return + * -EINVAL + */ + if (errno == EINVAL) + return 0; + + pr_perror("shstk: can't get shadow stack status for %d", pid); + return -1; + } + + if (!(features & ARCH_SHSTK_SHSTK)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: can't get SSP for %d", pid); + return -1; + } + } + + xsave->cet.cet = features; + xsave->cet.ssp = ssp; + + pr_debug("%d: shstk: cet: %lx ssp: %lx\n", pid, xsave->cet.cet, xsave->cet.ssp); } return 0; @@ -697,3 +749,14 @@ unsigned long compel_task_size(void) { return TASK_SIZE; } + +bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return false; + + if (ext_regs->cet.cet & ARCH_SHSTK_SHSTK) + return true; + + return false; +} diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 3bd36dda15..848d36c577 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -182,4 +182,12 @@ void 
compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); +#ifndef compel_shstk_enabled +static inline bool compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + return false; +} +#define compel_shstk_enabled +#endif + #endif diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index 912a4348b9..e068a9a020 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -133,6 +133,14 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpre #undef assign_array #undef assign_xsave + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + UserX86CetEntry *cet = core->thread_info->fpregs->xsave->cet; + struct cet_user_state *regs = &fpregs->cet; + + cet->cet = regs->cet; + cet->ssp = regs->ssp; + } + return 0; } @@ -199,6 +207,13 @@ static int alloc_xsave_extends(UserX86XsaveEntry *xsave) goto err; } + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + xsave->cet = xzalloc(sizeof(UserX86CetEntry)); + if (!xsave->cet) + goto err; + user_x86_cet_entry__init(xsave->cet); + } + return 0; err: return -1; @@ -220,6 +235,8 @@ int arch_alloc_thread_info(CoreEntry *core) with_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); if (with_xsave) sz += sizeof(UserX86XsaveEntry); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) + sz += sizeof(UserX86CetEntry); } m = xmalloc(sz); diff --git a/images/core-x86.proto b/images/core-x86.proto index 815cf21ff8..762418d73b 100644 --- a/images/core-x86.proto +++ b/images/core-x86.proto @@ -41,6 +41,11 @@ message user_x86_regs_entry { optional user_x86_regs_mode mode = 28 [default = NATIVE]; } +message user_x86_cet_entry { + required uint64 cet = 1[(criu).hex = true]; + required uint64 ssp = 2[(criu).hex = true]; +} + message user_x86_xsave_entry { /* standard xsave features */ required uint64 xstate_bv = 1; @@ -60,6 +65,9 @@ message user_x86_xsave_entry { /* Protected keys */ repeated uint32 pkru 
= 8; + /* CET */ + optional user_x86_cet_entry cet = 9; + /* * Processor trace (PT) and hardware duty cycling (HDC) * are supervisor state components and only managed by From eee22365078760e3f276fb79c94eeb0fe3774e11 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 15 May 2022 19:58:04 +0300 Subject: [PATCH 218/321] compel: infect: prepare parasite_service() for addition of CET support To support sigreturn with CET enabled parasite must rewind its stack before calling sigreturn so that shadow stack will be compatible with actual calling sequence. In addition, calling sigreturn from top level routine (__export_parasite_head_start) will significantly simplify the shadow stack manipulations required to execute sigreturn. For x86 make fini_sigreturn() return the stack pointer for the signal frame that will be used by sigreturn and propagate that return value up to __export_parasite_head_start. In non-daemon mode parasite_trap_cmd() returns non-positive value which allows to distinguish daemon and non-daemon mode and properly stop at int3 in non-daemon mode. Architectures other than x86 remain unchanged and will still call sigreturn from fini_sigreturn(). 
Signed-off-by: Mike Rapoport (IBM) --- compel/arch/x86/plugins/std/parasite-head.S | 14 +++++++++ .../x86/src/lib/include/uapi/asm/sigframe.h | 10 ++++++- compel/plugins/include/uapi/std/infect.h | 2 +- compel/plugins/std/infect.c | 30 +++++++++---------- criu/pie/restorer.c | 6 +++- 5 files changed, 44 insertions(+), 18 deletions(-) diff --git a/compel/arch/x86/plugins/std/parasite-head.S b/compel/arch/x86/plugins/std/parasite-head.S index 4fb38d1f14..42cad4808c 100644 --- a/compel/arch/x86/plugins/std/parasite-head.S +++ b/compel/arch/x86/plugins/std/parasite-head.S @@ -34,7 +34,21 @@ END(__export_parasite_head_start_compat) .code64 #endif +/* + * When parasite_service() runs in the daemon mode it will return the stack + * pointer for the sigreturn frame in %rax and we call sigreturn directly + * from here. + * Since a valid stack pointer is positive, it is safe to presume that + * return value <= 0 means that parasite_service() called parasite_trap_cmd() + * in non-daemon mode, and the parasite should stop at int3. 
+ */ ENTRY(__export_parasite_head_start) call parasite_service + cmp $0, %rax + jle 1f + movq %rax, %rsp + movq $15, %rax + syscall +1: int $0x03 END(__export_parasite_head_start) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h index ec8c156fa4..9a540694b2 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -203,13 +203,21 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) : "rdi"(new_sp) \ : "eax", "r8", "r9", "r10", "r11", "memory") -#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ +#define ARCH_RT_SIGRETURN_RST(new_sp, rt_sigframe) \ do { \ if ((rt_sigframe)->is_native) \ ARCH_RT_SIGRETURN_NATIVE(new_sp); \ else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ } while (0) + +#define ARCH_RT_SIGRETURN_DUMP(new_sp, rt_sigframe) \ +do { \ + if ((rt_sigframe)->is_native) \ + return new_sp; \ + else \ + ARCH_RT_SIGRETURN_COMPAT(new_sp); \ +} while (0) /* clang-format off */ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, diff --git a/compel/plugins/include/uapi/std/infect.h b/compel/plugins/include/uapi/std/infect.h index 08a5a7a804..a729abbd2b 100644 --- a/compel/plugins/include/uapi/std/infect.h +++ b/compel/plugins/include/uapi/std/infect.h @@ -7,7 +7,7 @@ extern int parasite_get_rpc_sock(void); extern unsigned int __export_parasite_service_cmd; extern void *__export_parasite_service_args_ptr; -extern int __must_check parasite_service(void); +extern unsigned long __must_check parasite_service(void); /* * Must be supplied by user plugins. 
diff --git a/compel/plugins/std/infect.c b/compel/plugins/std/infect.c index 60b21d3132..034201320f 100644 --- a/compel/plugins/std/infect.c +++ b/compel/plugins/std/infect.c @@ -16,6 +16,10 @@ #include "rpc-pie-priv.h" +#ifndef ARCH_RT_SIGRETURN_DUMP +#define ARCH_RT_SIGRETURN_DUMP ARCH_RT_SIGRETURN +#endif + static int tsock = -1; static struct rt_sigframe *sigframe; @@ -79,12 +83,13 @@ static int __parasite_daemon_wait_msg(struct ctl_msg *m) /* Core infect code */ -static noinline void fini_sigreturn(unsigned long new_sp) +static noinline unsigned long fini_sigreturn(unsigned long new_sp) { - ARCH_RT_SIGRETURN(new_sp, sigframe); + ARCH_RT_SIGRETURN_DUMP(new_sp, sigframe); + return new_sp; } -static int fini(void) +static unsigned long fini(void) { unsigned long new_sp; @@ -96,14 +101,14 @@ static int fini(void) sys_close(tsock); std_log_set_fd(-1); - fini_sigreturn(new_sp); + return fini_sigreturn(new_sp); BUG(); return -1; } -static noinline __used int noinline parasite_daemon(void *args) +static noinline __used unsigned long parasite_daemon(void *args) { struct ctl_msg m; int ret = -1; @@ -140,12 +145,10 @@ static noinline __used int noinline parasite_daemon(void *args) } out: - fini(); - - return 0; + return fini(); } -static noinline __used int parasite_init_daemon(void *data) +static noinline __used unsigned long parasite_init_daemon(void *data) { struct parasite_init_args *args = data; int ret; @@ -178,14 +181,11 @@ static noinline __used int parasite_init_daemon(void *data) } else goto err; - parasite_daemon(data); + return parasite_daemon(data); err: futex_set_and_wake(&args->daemon_connected, ret); - fini(); - BUG(); - - return -1; + return fini(); } #ifndef __parasite_entry @@ -203,7 +203,7 @@ static noinline __used int parasite_init_daemon(void *data) unsigned int __export_parasite_service_cmd = 0; void *__export_parasite_service_args_ptr = NULL; -int __used __parasite_entry parasite_service(void) +unsigned long __used __parasite_entry 
parasite_service(void) { unsigned int cmd = __export_parasite_service_cmd; void *args = __export_parasite_service_args_ptr; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 02971657ef..20c6801c5e 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -78,6 +78,10 @@ #define FALLOC_FL_PUNCH_HOLE 0x02 #endif +#ifndef ARCH_RT_SIGRETURN_RST +#define ARCH_RT_SIGRETURN_RST ARCH_RT_SIGRETURN +#endif + #define sys_prctl_safe(opcode, val1, val2, val3) \ ({ \ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ @@ -631,7 +635,7 @@ static int restore_thread_common(struct thread_restore_args *args) static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sigframe) { - ARCH_RT_SIGRETURN(new_sp, sigframe); + ARCH_RT_SIGRETURN_RST(new_sp, sigframe); } static int send_cg_set(int sk, int cg_set) From a09a0eb081b9d63ce0defbb128115a38aea9be28 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 15 May 2022 21:06:50 +0300 Subject: [PATCH 219/321] compel: shstk: prepare shadow stack signal frame When calling sigreturn with CET enabled, the kernel verifies that the shadow stack has proper address of sa_restorer and a "restore token". Normally, they are pushed to the shadow stack when signal processing is started. Since compel calls sigreturn directly, the shadow stack should be updated to match the kernel expectations for sigreturn invocation. Add parasite_setup_shstk() that sets up the shadow stack with the address of __export_parasite_head_start as sa_restorer and with the required restore token.
Signed-off-by: Mike Rapoport (IBM) --- .../src/lib/include/uapi/asm/infect-types.h | 4 ++ compel/arch/x86/src/lib/infect.c | 45 +++++++++++++++++++ compel/include/uapi/infect.h | 9 ++++ compel/src/lib/infect.c | 3 ++ 4 files changed, 61 insertions(+) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h index 2619fe64a5..b998c488c7 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -146,4 +146,8 @@ typedef struct xsave_struct user_fpregs_struct_t; extern bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs); #define compel_shstk_enabled __compel_shstk_enabled +extern int __parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs); +#define parasite_setup_shstk __parasite_setup_shstk + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index aabb4f3715..a07b1c9f37 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -760,3 +760,48 @@ bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) return false; } + +int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) +{ + pid_t pid = ctl->rpid; + unsigned long sa_restorer = ctl->parasite_ip; + unsigned long long ssp; + unsigned long token; + struct iovec iov; + + if (!compel_shstk_enabled(ext_regs)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: %d: cannot get SSP", pid); + return -1; + } + } + + /* The token is for 64-bit */ + token = ALIGN_DOWN(ssp, 8); + token |= (1UL << 63); + ssp = ALIGN_DOWN(ssp, 8) - 8; + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, token)) { + pr_perror("shstk: %d: failed to inject shadow 
stack token", pid); + return -1; + } + + ssp = ssp - sizeof(uint64_t); + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, sa_restorer)) { + pr_perror("shstk: %d: failed to inject restorer address", pid); + return -1; + } + + ssp = ssp + sizeof(uint64_t); + if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + pr_perror("shstk: %d: cannot write SSP", pid); + return -1; + } + + return 0; +} diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 848d36c577..cd62559097 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -190,4 +190,13 @@ static inline bool compel_shstk_enabled(user_fpregs_struct_t *ext_regs) #define compel_shstk_enabled #endif +#ifndef parasite_setup_shstk +static inline int parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs) +{ + return 0; +} +#define parasite_setup_shstk parasite_setup_shstk +#endif + #endif diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 696daa7f1b..79d00c9a10 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -760,6 +760,9 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask)) return -1; + if (parasite_setup_shstk(ctl, &ext_regs)) + return -1; + if (parasite_init_daemon(ctl)) return -1; From dbab2766016ebdbcc5bc1cfd0b45a564e618ed5a Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 15 May 2022 21:19:58 +0300 Subject: [PATCH 220/321] criu: shstk: add VMA_AREA_SHSTK flag The shadow stack VMAs require special care because they can only be created and populated using special system calls. 
Add VMA_AREA_SHSTK flag and set it for VMAs that are marked as "ss" in /proc/pid/smaps Signed-off-by: Mike Rapoport (IBM) --- criu/include/image.h | 3 +++ criu/proc_parse.c | 17 ++++++++++++++--- lib/pycriu/images/pb2dict.py | 1 + 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/criu/include/image.h b/criu/include/image.h index 9a275565f9..a17aae35c2 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -35,6 +35,8 @@ * - stack * the memory area is used in application stack so we * should be careful about guard page here + * - shadow stack + * the memory area is used by shadow stack * - vsyscall * special memory area injected into the task memory * space by the kernel itself, represent virtual syscall @@ -84,6 +86,7 @@ #define VMA_AREA_VVAR (1 << 12) #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_SHSTK (1 << 15) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 2b94050350..55aefac7d7 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -118,7 +118,8 @@ bool handle_vma_plugin(int *fd, struct stat *stat) return true; } -static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) +static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, + int *shstk) { char *tok; @@ -162,6 +163,9 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) *io_pf = 1; + if (_vmflag_match(tok, "ss")) + *shstk = 1; + /* * Anything else is just ignored. 
*/ @@ -172,14 +176,21 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) { - __parse_vmflags(buf, flags, madv, io_pf); + int shstk = 0; + + __parse_vmflags(buf, flags, madv, io_pf, &shstk); } static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) { int io_pf = 0; + int shstk = 0; + + __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf, + &shstk); - __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf); + if (shstk) + vma_area->e->status |= VMA_AREA_SHSTK; /* * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index d29fdf66ce..0d1a246927 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -103,6 +103,7 @@ def _custom_conv(field): ('VMA_AREA_VVAR', 1 << 12), ('VMA_AREA_AIORING', 1 << 13), ('VMA_AREA_MEMFD', 1 << 14), + ('VMA_AREA_SHSTK', 1 << 15), ('VMA_UNSUPP', 1 << 31), ] From 95896b4a197ac2ae9140a4356bcc1bccc8cca8d2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 24 May 2022 21:25:14 +0300 Subject: [PATCH 221/321] criu: shstk: premap and prepopulate shadow stack VMAs Shadow stack VMAs cannot be mmap()ed, they must be created using map_shadow_stack() system call and populated using special wrss instruction available only when shadow stack is enabled. Premap them to reserve virtual address space and populate them to have their contents available for later copying after enabling shadow stack. Along with the space required by shadow stack VMAs also reserve an extra page that will be later used as a temporary shadow stack.
Signed-off-by: Mike Rapoport (IBM) --- criu/include/vma.h | 1 + criu/mem.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/criu/include/vma.h b/criu/include/vma.h index 4b663ee500..b8ddfc1422 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -106,6 +106,7 @@ static inline bool vma_entry_is_private(VmaEntry *entry, unsigned long task_size return (vma_entry_is(entry, VMA_AREA_REGULAR) && (vma_entry_is(entry, VMA_ANON_PRIVATE) || vma_entry_is(entry, VMA_FILE_PRIVATE)) && (entry->end <= task_size)) || + vma_entry_is(entry, VMA_AREA_SHSTK) || vma_entry_is(entry, VMA_AREA_AIORING); } diff --git a/criu/mem.c b/criu/mem.c index f56ed826b3..0236c5e1e9 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -741,6 +741,8 @@ int prepare_mm_pid(struct pstree_item *i) ri->vmas.rst_priv_size += vma_area_len(vma); if (vma_has_guard_gap_hidden(vma)) ri->vmas.rst_priv_size += PAGE_SIZE; + if (vma_area_is(vma, VMA_AREA_SHSTK)) + ri->vmas.rst_priv_size += PAGE_SIZE; } pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); @@ -882,6 +884,14 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void vma->e->start -= PAGE_SIZE; size = vma_entry_len(vma->e); + + /* + * map an extra page for shadow stack VMAs, it will be used as a + * temporary shadow stack + */ + if (vma_area_is(vma, VMA_AREA_SHSTK)) + size += PAGE_SIZE; + if (!vma_inherited(vma)) { int flag = 0; /* @@ -957,6 +967,15 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head) { + /* + * Shadow stack VMAs cannot be mmap()ed, they must be created using + * map_shadow_stack() system call. + * Premap them to reserve virtual address space and populate them + * to have their contents available for later copying.
+ */ + if (vma_area_is(vma, VMA_AREA_SHSTK)) + return true; + /* * On kernels with 4K guard pages, growsdown VMAs * always have one guard page at the From 763d07a18d023d7806683b1cdf514be36ec0b0c4 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 25 May 2022 10:30:06 +0300 Subject: [PATCH 222/321] criu: shstk: prepare shadow stack parameters for restorer blob Shadow stacks must be populated using special WRSS instruction. This instruction is only available when shadow stack is enabled, calling it with disabled shadow stack causes #UD. Moreover, shadow stack VMAs cannot be mremap()ed and they must be created using map_shadow_stack() system call. This requires delaying the restore of shadow stacks to restorer blob after the CRIU mappings are cleared. Introduce rst_shstk_info structure to hold shadow stack parameters required in the restorer blob and populate this structure in arch_prepare_shstk() method. Signed-off-by: Mike Rapoport (IBM) Signed-off-by: Andrei Vagin --- criu/arch/x86/Makefile | 1 + criu/arch/x86/include/asm/restorer.h | 1 + criu/arch/x86/include/asm/shstk.h | 69 +++++++++++++++++++++ criu/arch/x86/shstk.c | 90 ++++++++++++++++++++++++++++ criu/cr-restore.c | 3 + criu/include/restore.h | 13 ++++ criu/include/restorer.h | 8 +++ 7 files changed, 185 insertions(+) create mode 100644 criu/arch/x86/include/asm/shstk.h create mode 100644 criu/arch/x86/shstk.c diff --git a/criu/arch/x86/Makefile b/criu/arch/x86/Makefile index 618e85bb3e..46f00e9e93 100644 --- a/criu/arch/x86/Makefile +++ b/criu/arch/x86/Makefile @@ -9,6 +9,7 @@ obj-y += cpu.o obj-y += crtools.o obj-y += kerndat.o obj-y += sigframe.o +obj-y += shstk.o ifeq ($(CONFIG_COMPAT),y) obj-y += sigaction_compat.o endif diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h index f7a6d50589..3a673958d1 100644 --- a/criu/arch/x86/include/asm/restorer.h +++ b/criu/arch/x86/include/asm/restorer.h @@ -8,6 +8,7 @@ #include #include #include "asm/compat.h" 
+#include "asm/shstk.h" #ifdef CONFIG_COMPAT extern void restore_tls(tls_t *ptls); diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h new file mode 100644 index 0000000000..a81062010c --- /dev/null +++ b/criu/arch/x86/include/asm/shstk.h @@ -0,0 +1,69 @@ +#ifndef __CR_ASM_SHSTK_H__ +#define __CR_ASM_SHSTK_H__ + +/* + * Shadow stack constants from Linux + */ +/* arch/x86/include/uapi/asm/mman.h */ +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +/* arch/x86/include/uapi/asm/prctl.h */ +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + +#define ARCH_HAS_SHSTK + +/* from arch/x86/kernel/shstk.c */ +#define SHSTK_DATA_BIT (1UL << 63) /* BIT(63) */ + +/* + * Shadow stack memory cannot be restored with memcpy/pread but only using + * a special instruction that can write to shadow stack. + * That instruction is only available when shadow stack is enabled, + * otherwise it causes #UD. + * + * Also, shadow stack VMAs cannot be mmap()ed or mremap()ed, they must be + * created using map_shadow_stack() system call. This pushes creation of + * shadow stack VMAs to the restorer blob after CRIU mappings are freed.
+ * + * And there is additional juggling with shadow stacks to ensure that we + * don't unmap an active shadow stack + * + * The overall sequence of restoring shadow stack is + * - Enable shadow stack early after clone()ing the task + * - Unlock shadow stack features using ptrace + * - In the restorer blob: + * - switch to a temporary shadow stack to be able to unmap shadow stack + * with the CRIU mappings + * - after memory mappings are restored, recreate shadow stack VMAs, + * populate them using wrss instruction and switch to the task shadow + * stack + * - lock shadow stack features + */ +struct rst_shstk_info { + unsigned long vma_start; /* start of shadow stack VMA */ + unsigned long vma_size; /* size of shadow stack VMA */ + unsigned long premmaped_addr; /* address of shadow stack copy in + the premmaped area */ + unsigned long tmp_shstk; /* address of temporary shadow stack */ + u64 ssp; /* shadow stack pointer */ + u64 cet; /* CET control state */ +}; +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_shstk_prepare + +#endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c new file mode 100644 index 0000000000..f6bc81dc68 --- /dev/null +++ b/criu/arch/x86/shstk.c @@ -0,0 +1,90 @@ +#include + +#include + +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +static bool task_needs_shstk(struct pstree_item *item, CoreEntry *core) +{ + UserX86FpregsEntry *fpregs; + + if (!task_alive(item)) + return false; + + fpregs = core->thread_info->fpregs; + if (fpregs->xsave && fpregs->xsave->cet) { + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + pr_warn_once("Restoring task with shadow stack on non-CET machine\n"); + return false; + } + + if (fpregs->xsave->cet->cet & ARCH_SHSTK_SHSTK) + return true; + } + + return false;
+} + +static int shstk_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *shstk) +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, shstk->ssp)) { + unsigned long premmaped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + shstk->vma_start = vma->e->start; + shstk->vma_size = size; + shstk->premmaped_addr = premmaped_addr; + shstk->tmp_shstk = premmaped_addr + size; + + break; + } + } + + return 0; +} + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) +{ + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + UserX86FpregsEntry *fpregs = core->thread_info->fpregs; + struct vm_area_list *vmas = &rsti(item)->vmas; + struct rst_shstk_info *shstk = &ta->shstk; + int i; + + if (!task_needs_shstk(item, core)) + return 0; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + struct thread_restore_args *thread_args = &args_array[i]; + + core = item->core[i]; + fpregs = core->thread_info->fpregs; + shstk = &thread_args->shstk; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + } + + return 0; +} diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 2700497216..e43cc1742b 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -975,6 +975,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (setup_uffd(pid, ta)) return -1; + if (arch_shstk_prepare(current, core, ta)) + return -1; + return sigreturn_restore(pid, ta, args_len, core); } diff --git a/criu/include/restore.h b/criu/include/restore.h index 8ef0dbddf8..7d29496f23 100644 --- 
a/criu/include/restore.h +++ b/criu/include/restore.h @@ -7,4 +7,17 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); +struct task_restore_args; +struct pstree_item; + +#ifndef arch_shstk_prepare +static inline int arch_shstk_prepare(struct pstree_item *item, + CoreEntry *core, + struct task_restore_args *ta) +{ + return 0; +} +#define arch_shstk_prepare arch_shstk_prepare +#endif + #endif diff --git a/criu/include/restorer.h b/criu/include/restorer.h index f398d8d8fe..73565d1de4 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -56,6 +56,10 @@ struct restore_posix_timer { int overrun; }; +#ifndef rst_shstk_info +struct rst_shstk_info {}; +#endif + /* * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things @@ -119,6 +123,8 @@ struct thread_restore_args { unsigned int seccomp_filters_n; bool seccomp_force_tsync; + struct rst_shstk_info shstk; + char comm[TASK_COMM_LEN]; int cg_set; int cgroupd_sk; @@ -240,6 +246,8 @@ struct task_restore_args { uid_t uid; u32 cap_eff[CR_CAP_SIZE]; + + struct rst_shstk_info shstk; } __aligned(64); /* From 9ac6584b134b662d2fe400b43f5a9446a86045e6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 29 Nov 2023 18:55:35 +0200 Subject: [PATCH 223/321] criu: kerndat: add kdat_has_shstk() Detect if CRIU runs with shadow stack enabled and store the result in kerndat. Unlike most kerndat knobs, kdat_has_shstk() does not check for availability of the shadow stack in the kernel, but rather checks if criu runs with shadow stack enabled. This depends on hardware availability, kernel and glibc support, compiler options and glibc tunables, so kdat_has_shstk() must be called every time CRIU starts and its result cannot be cached. The result will be used by the code that controls shadow stack enablement in the next commit.
Signed-off-by: Mike Rapoport (IBM) --- criu/arch/x86/include/asm/kerndat.h | 1 + criu/arch/x86/kerndat.c | 27 +++++++++++++++++++++++++++ criu/include/kerndat.h | 1 + criu/kerndat.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/criu/arch/x86/include/asm/kerndat.h b/criu/arch/x86/include/asm/kerndat.h index 903bc80f7c..5c37172302 100644 --- a/criu/arch/x86/include/asm/kerndat.h +++ b/criu/arch/x86/include/asm/kerndat.h @@ -4,5 +4,6 @@ extern int kdat_compatible_cr(void); extern int kdat_can_map_vdso(void); extern int kdat_x86_has_ptrace_fpu_xsave_bug(void); +extern int kdat_has_shstk(void); #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c index a98797d39f..3a58bbea7a 100644 --- a/criu/arch/x86/kerndat.c +++ b/criu/arch/x86/kerndat.c @@ -17,6 +17,7 @@ #include "asm/compat.h" #include "asm/dump.h" +#include "asm/shstk.h" int kdat_can_map_vdso(void) { @@ -251,3 +252,29 @@ int kdat_x86_has_ptrace_fpu_xsave_bug(void) return ret; } + +/* + * Unlike most kerndat knobs, this does not check for availability of the + * shadow stack in the kernel, but rather checks if criu runs with shadow + * stack enabled. + * + * This depends on hardware availability, kernel and glibc support, compiler + * options and glibc tunables. 
+ */ +int kdat_has_shstk(void) +{ + unsigned long features; + + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return 0; + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_STATUS, &features)) { + /* kernels that don't support shadow stack return -EINVAL */ + if (errno == EINVAL) + return 0; + pr_perror("Cannot get shadow stack status"); + return -1; + } + + return !!(features & ARCH_SHSTK_SHSTK); +} diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 91dbd494b2..41524ed663 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -87,6 +87,7 @@ struct kerndat_s { bool has_ipv6_freebind; bool has_membarrier_get_registrations; bool has_pagemap_scan; + bool has_shstk; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index e3b378a9c7..6f4fea46b8 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1151,6 +1151,24 @@ static int kerndat_has_openat2(void) return 0; } +int __attribute__((weak)) kdat_has_shstk(void) +{ + return 0; +} + +static int kerndat_has_shstk(void) +{ + int ret = kdat_has_shstk(); + + if (ret < 0) { + pr_err("kdat_has_shstk failed\n"); + return ret; + } + + kdat.has_shstk = !!ret; + return 0; +} + #define KERNDAT_CACHE_NAME "criu.kdat" #define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME @@ -1705,6 +1723,12 @@ int kerndat_try_load_new(void) return ret; } + ret = kerndat_has_shstk(); + if (ret < 0) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + return ret; + } + /* New information is found, we need to save to the cache */ if (ret) kerndat_save_cache(); @@ -1926,6 +1950,10 @@ int kerndat_init(void) pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_shstk()) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); From 95c049e21f1319e0481d6d3ff9a82ea5bbe749f2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 31 May 2022 
12:45:09 +0300 Subject: [PATCH 224/321] restore: add infrastructure to enable shadow stack There are several gotchas when restoring a task with shadow stack: * depending on the compiler options, glibc version and glibc tunables CRIU can run with or without shadow stack. * shadow stack VMAs are special, they must be created using a dedicated map_shadow_stack() system call and can be modified only by a special instruction (wrss) that is only available when shadow stack is enabled. * once shadow stack is enabled, it is not writable even with wrss; writes to shadow stack can only be enabled with ptrace() and only when shadow stack is enabled in the tracee. * if the shadow stack is enabled during restore rather than by glibc, calling retq after arch_prctl() that enables the shadow stack causes #CP, so the function that enables shadow stack can never return. Add the infrastructure required to cope with all of those: * modify the restore code to allow trampoline (arch_shstk_trampoline) that will enable shadow stack and call restore_task_with_children(). * add call to arch_shstk_unlock() right after the tasks are clone()ed; this will allow unlocking shadow stack features and making shadow stack writable. 
* add stubs for architectures that do not support shadow stacks * add implementation of arch_shstk_trampoline() and arch_shstk_unlock() for x86, but keep it disabled; it will be enabled along with the addition of the code that will restore shadow stack in the restorer blob Signed-off-by: Mike Rapoport (IBM) --- criu/arch/x86/include/asm/shstk.h | 9 ++ criu/arch/x86/shstk.c | 133 ++++++++++++++++++++++++++++++ criu/cr-restore.c | 14 +++- criu/include/restore.h | 18 ++++ criu/include/rst_info.h | 3 + 5 files changed, 176 insertions(+), 1 deletion(-) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index a81062010c..7849dd7a60 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -66,4 +66,13 @@ int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, struct task_restore_args *ta); #define arch_shstk_prepare arch_shstk_prepare +#if 0 +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid); +#define arch_shstk_unlock arch_shstk_unlock + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg); +#define arch_shstk_trampoline arch_shstk_trampoline +#endif + #endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c index f6bc81dc68..b752f114a8 100644 --- a/criu/arch/x86/shstk.c +++ b/criu/arch/x86/shstk.c @@ -1,3 +1,6 @@ +#include +#include + #include #include @@ -88,3 +91,133 @@ int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, return 0; } + +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid) +{ + unsigned long features; + int status; + int ret = -1; + + /* + * CRIU runs with no shadow stack and the task does not need one, + * nothing to do. 
+ */ + if (!kdat.has_shstk && !task_needs_shstk(item, core)) + return 0; + + futex_wait_until(&rsti(item)->shstk_enable, 1); + + if (ptrace(PTRACE_SEIZE, pid, 0, 0)) { + pr_perror("Cannot attach to %d", pid); + goto futex_wake; + } + + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Cannot interrupt the %d task", pid); + goto detach; + } + + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("waitpid(%d) failed", pid); + goto detach; + } + + features = ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS; + if (ptrace(PTRACE_ARCH_PRCTL, pid, features, ARCH_SHSTK_UNLOCK)) { + pr_perror("Cannot unlock CET for %d task", pid); + goto detach; + } + + ret = 0; + +detach: + if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { + pr_perror("Unable to detach %d", pid); + ret = -1; + } + +futex_wake: + futex_set_and_wake(&rsti(item)->shstk_unlock, 1); + + return ret; +} + +static void shstk_sync_unlock(struct pstree_item *item) +{ + /* notify parent that shadow stack is enabled ... */ + futex_set_and_wake(&rsti(item)->shstk_enable, 1); + + /* ... 
+ and wait until it unlocks its features with ptrace */ + futex_wait_until(&rsti(item)->shstk_unlock, 1); +} + +static void __arch_shstk_enable(struct pstree_item *item, + int (*func)(void *arg), void *arg) +{ + int ret; + + shstk_sync_unlock(item); + + /* return here would cause #CP, use exit() instead */ + ret = func(arg); + exit(ret); +} + +static int shstk_disable(struct pstree_item *item) +{ + shstk_sync_unlock(item); + + /* disable shadow stack, implicitly clears ARCH_SHSTK_WRSS */ + if (syscall(__NR_arch_prctl, ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK)) { + pr_perror("Failed to disable shadow stack"); + return -1; + } + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_LOCK, + ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS)) { + pr_perror("Failed to lock shadow stack controls"); + return -1; + } + + return 0; +} + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + unsigned long features = ARCH_SHSTK_SHSTK; + int code = ARCH_SHSTK_ENABLE; + + /* + * If task does not need shadow stack but CRIU runs with shadow + * stack enabled, we should disable it before continuing with + * restore + */ + if (!task_needs_shstk(item, core)) { + if (kdat.has_shstk && shstk_disable(item)) + return -1; + return func(arg); + } + + /* + * Calling sys_arch_prctl() means there will be use of retq + * instruction after shadow stack is enabled and this will cause + * Control Protection fault. Open code sys_arch_prctl() in + * assembly. + * + * code and features should be in %rdi and %rsi and will be passed to + * the system call as is. 
+ */ + asm volatile("movq $"__stringify(__NR_arch_prctl)", %%rax \n" + "syscall \n" + "cmpq $0, %%rax \n" + "je 1f \n" + "retq \n" + "1: \n" + :: "D"(code), "S"(features)); + + __arch_shstk_enable(item, func, arg); + + /* never reached */ + return -1; +} diff --git a/criu/cr-restore.c b/criu/cr-restore.c index e43cc1742b..318d34c487 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1498,6 +1498,8 @@ static inline int fork_with_pid(struct pstree_item *item) pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item)); } + arch_shstk_unlock(item, ca.core, pid); + err_unlock: if (!(ca.clone_flags & CLONE_NEWPID)) unlock_last_pid(); @@ -1764,7 +1766,7 @@ static int create_children_and_session(void) return 0; } -static int restore_task_with_children(void *_arg) +static int __restore_task_with_children(void *_arg) { struct cr_clone_arg *ca = _arg; pid_t pid; @@ -1956,6 +1958,16 @@ static int restore_task_with_children(void *_arg) exit(1); } +static int restore_task_with_children(void *_arg) +{ + struct cr_clone_arg *arg = _arg; + struct pstree_item *item = arg->item; + CoreEntry *core = arg->core; + + return arch_shstk_trampoline(item, core, __restore_task_with_children, + arg); +} + static int attach_to_tasks(bool root_seized) { struct pstree_item *item; diff --git a/criu/include/restore.h b/criu/include/restore.h index 7d29496f23..04d0065051 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -20,4 +20,22 @@ static inline int arch_shstk_prepare(struct pstree_item *item, #define arch_shstk_prepare arch_shstk_prepare #endif +#ifndef arch_shstk_unlock +static inline int arch_shstk_unlock(struct pstree_item *item, + CoreEntry *core, pid_t pid) +{ + return 0; +} +#define arch_shstk_unlock arch_shstk_unlock +#endif + +#ifndef arch_shstk_trampoline +static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + return func(arg); +} +#define arch_shstk_trampoline arch_shstk_trampoline 
+#endif + #endif diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 704b42a727..59b891fa26 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -75,6 +75,9 @@ struct rst_info { struct rst_rseq *rseqe; + futex_t shstk_enable; + futex_t shstk_unlock; + void *breakpoint; }; From cb39c62f8a2be2a686dbfc60a2730729dd082624 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 25 May 2022 12:30:04 +0300 Subject: [PATCH 225/321] restorer: shstk: implement shadow stack restore The restore of a task with shadow stack enabled adds these steps: * switch from the default shadow stack to a temporary shadow stack allocated in the premmaped area * unmap CRIU mappings; nothing changed here, but it's important that CRIU mappings can be removed only after switching to a temporary shadow stack * create shadow stack VMA with map_shadow_stack() * restore shadow stack contents with wrss * switch to "real" shadow stack * lock shadow stack features Signed-off-by: Mike Rapoport (IBM) --- .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + .../x86/src/lib/include/uapi/asm/sigframe.h | 23 +- criu/arch/x86/include/asm/shstk.h | 204 +++++++++++++++++- criu/include/restorer.h | 16 ++ criu/pie/Makefile | 5 + criu/pie/restorer.c | 29 +++ 6 files changed, 271 insertions(+), 7 deletions(-) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 57681b79a7..4e843bee9e 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -118,3 +118,4 @@ __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) __NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_map_shadow_stack 453 sys_map_shadow_stack (unsigned long addr, 
unsigned long size, unsigned int flags) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h index 9a540694b2..4a2e675597 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -177,6 +177,24 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) #define USER32_CS 0x23 /* clang-format off */ +/* + * rst_sigreturn in restorer is a non-inline call which adds an entry to the + * shadow stack above the sigframe token; + * if shadow stack is enabled, increment the shadow stack pointer to remove + * that entry + */ +#define ARCH_SHSTK_POP() \ + asm volatile( \ + "xor %%rax, %%rax\n" \ + "rdsspq %%rax\n" \ + "cmpq $0, %%rax\n" \ + "jz 1f\n" \ + "movq $1, %%rax\n" \ + "incsspq %%rax\n" \ + "1:\n" \ + : : \ + : "rax") + #define ARCH_RT_SIGRETURN_NATIVE(new_sp) \ asm volatile( \ "movq %0, %%rax \n" \ @@ -205,9 +223,10 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) #define ARCH_RT_SIGRETURN_RST(new_sp, rt_sigframe) \ do { \ - if ((rt_sigframe)->is_native) \ + if ((rt_sigframe)->is_native) { \ + ARCH_SHSTK_POP(); \ ARCH_RT_SIGRETURN_NATIVE(new_sp); \ - else \ + } else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ } while (0) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index 7849dd7a60..7814c351d1 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -10,11 +10,11 @@ #endif /* arch/x86/include/uapi/asm/prctl.h */ -#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_ENABLE 0x5001 #define ARCH_SHSTK_DISABLE 0x5002 #define ARCH_SHSTK_LOCK 0x5003 -#define ARCH_SHSTK_UNLOCK 0x5004 -#define ARCH_SHSTK_STATUS 0x5005 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 #define ARCH_SHSTK_SHSTK (1ULL << 0) #define ARCH_SHSTK_WRSS (1ULL << 1) @@ -66,13 +66,207 @@ int arch_shstk_prepare(struct pstree_item *item, CoreEntry 
*core, struct task_restore_args *ta); #define arch_shstk_prepare arch_shstk_prepare -#if 0 int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid); #define arch_shstk_unlock arch_shstk_unlock int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, int (*func)(void *arg), void *arg); #define arch_shstk_trampoline arch_shstk_trampoline -#endif + +#ifdef CR_NOGLIBC + +#include +#include +#include "vma.h" + +#define SHSTK_BUSY_BIT (1UL << 0) /* BIT(0) */ + +static inline int shstk_map(unsigned long addr, unsigned long size) +{ + long shstk = sys_map_shadow_stack(addr, size, SHADOW_STACK_SET_TOKEN); + + if (shstk < 0) { + pr_err("Failed to map shadow stack at %lx: %ld\n", addr, shstk); + return -1; + } + + if (shstk != addr) { + pr_err("Shadow stack address mismatch: need %lx, got %lx\n", addr, shstk); + return -1; + } + + pr_info("Created shadow stack at %lx\n", shstk); + + return 0; +} + +/* clang-format off */ +static inline unsigned long get_ssp(void) +{ + unsigned long ssp; + + asm volatile("rdsspq %0" : "=r"(ssp) :: ); + + return ssp; +} + +static inline void wrssq(unsigned long addr, unsigned long val) +{ + asm volatile("wrssq %1, (%0)" :: "r"(addr), "r"(val) : "memory"); +} +/* clang-format off */ + +static always_inline void shstk_switch_ssp(unsigned long new_ssp) +{ + unsigned long old_ssp = get_ssp(); + + asm volatile("rstorssp (%0)\n" :: "r"(new_ssp)); + asm volatile("saveprevssp"); + + pr_debug("changed ssp from %lx to %lx\n", old_ssp, new_ssp); +} + +/* + * Disable writes to the shadow stack and lock it's disable/enable control + */ +static inline int shstk_finalize(void) +{ + int ret = 0; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return ret; +} + +/* + * Restore contents of the shadow stack 
and set shadow stack pointer + */ +static always_inline int shstk_restore(struct rst_shstk_info *cet) +{ + unsigned long *shstk_data = (unsigned long *)cet->premmaped_addr; + unsigned long ssp = cet->vma_start + cet->vma_size - 8; + unsigned long shstk_top = cet->vma_size / 8 - 1; + unsigned long val; + long ret; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + if (shstk_map(cet->vma_start, cet->vma_size)) + return -1; + + /* + * Switch shadow stack from temporary location to the actual task's + * shadow stack VMA + */ + shstk_switch_ssp(ssp); + + /* restore shadow stack contents */ + for (; ssp >= cet->ssp; ssp -= 8, shstk_top--) + wrssq(ssp, shstk_data[shstk_top]); + + /* + * Add tokens for sigreturn frame and for switch of the shadow stack. + * The sigreturn token will be checked by the kernel during + * processing of sigreturn + * The token for stack switch is required by rstorssp and + * saveprevssp semantics + */ + + /* token for sigreturn frame */ + val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; + wrssq(ssp, val); + + /* shadow stack switch token */ + val = ssp | SHSTK_BUSY_BIT; + ssp -= 8; + wrssq(ssp, val); + + /* reset shadow stack pointer to the proper location */ + shstk_switch_ssp(ssp); + + ret = sys_munmap(shstk_data, cet->vma_size + PAGE_SIZE); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + return shstk_finalize(); +} +#define arch_shstk_restore shstk_restore + +/* + * Disable shadow stack + */ +static inline int shstk_disable(void) +{ + int ret; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); + if (ret) { + pr_err("Failed to disable shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return ret; +} + +/* + * Switch to 
temporary shadow stack + */ +static always_inline int shstk_switch_to_restorer(struct rst_shstk_info *cet) +{ + unsigned long ssp; + long ret; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + ret = sys_munmap((void *)cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) { + pr_err("Failed to unmap area for temporary shadow stack\n"); + return -1; + } + + ret = shstk_map(cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) + return -1; + + /* + * Switch shadow stack from the default created by the kernel to a + * temporary shadow stack allocated in the premmaped area + */ + ssp = cet->tmp_shstk + PAGE_SIZE - 8; + shstk_switch_ssp(ssp); + + ret = sys_arch_prctl(ARCH_SHSTK_ENABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to enable writes to shadow stack\n"); + return ret; + } + + return 0; +} +#define arch_shstk_switch_to_restorer shstk_switch_to_restorer + +#endif /* CR_NOGLIBC */ #endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 73565d1de4..3fb5322a4b 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -339,4 +339,20 @@ enum { #define __r_sym(name) restorer_sym##name #define restorer_sym(rblob, name) (void *)(rblob + __r_sym(name)) +#ifndef arch_shstk_switch_to_restorer +static inline int arch_shstk_switch_to_restorer(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_switch_to_restorer arch_shstk_switch_to_restorer +#endif + +#ifndef arch_shstk_restore +static inline int arch_shstk_restore(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_restore arch_shstk_restore +#endif + #endif /* __CR_RESTORER_H__ */ diff --git a/criu/pie/Makefile b/criu/pie/Makefile index 265dcf82bd..912fab24ba 100644 --- a/criu/pie/Makefile +++ b/criu/pie/Makefile @@ -18,6 +18,11 @@ ifeq ($(ARCH),mips) ccflags-y += -mno-abicalls -fno-pic endif +# -mshstk required for CET instructions +ifeq ($(ARCH),x86) + ccflags-y += -mshstk +endif + LDS := compel/arch/$(ARCH)/scripts/compel-pack.lds.S 
restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 20c6801c5e..7c34c06d47 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -752,6 +752,10 @@ __visible long __export_restore_thread(struct thread_restore_args *args) goto core_restore_end; } + /* restore original shadow stack */ + if (arch_shstk_restore(&args->shstk)) + goto core_restore_end; + /* All signals must be handled by thread leader */ ksigfillset(&to_block); ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t)); @@ -1672,6 +1676,9 @@ __visible long __export_restore_task(struct task_restore_args *args) pr_debug("lazy-pages: uffd %d\n", args->uffd); } + if (arch_shstk_switch_to_restorer(&args->shstk)) + goto core_restore_end; + /* * Park vdso/vvar in a safe place if architecture doesn't support * mapping them with arch_prctl(). @@ -1723,6 +1730,13 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start > vma_entry->shmid) break; + /* + * shadow stack VMAs cannot be remapped, they must be + * recreated with map_shadow_stack system call + */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) + continue; + if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -1740,6 +1754,13 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start < vma_entry->shmid) break; + /* + * shadow stack VMAs cannot be remapped, they must be + * recreated with map_shadow_stack system call + */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) + continue; + if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -2166,6 +2187,14 @@ __visible long __export_restore_task(struct task_restore_args *args) futex_set_and_wake(&thread_inprogress, args->nr_threads); + /* + * Shadow stack of the leader can be locked only after all other + * threads were cloned, otherwise they may start with read-only + * shadow stack. 
+ */ + if (arch_shstk_restore(&args->shstk)) + goto core_restore_end; + restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); if (ret) From f7b2e63e09f219268c8bf2fe13725f678a925fa0 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 24 Mar 2024 11:07:07 +0000 Subject: [PATCH 226/321] ci: try to fix broken docker test Upgrade to 22.04 base image and use the existing version of docker. Signed-off-by: Adrian Reber --- .github/workflows/docker-test.yml | 2 +- scripts/ci/docker-test.sh | 19 ------------------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml index fabf399fd3..11d67432ba 100644 --- a/.github/workflows/docker-test.yml +++ b/.github/workflows/docker-test.yml @@ -12,7 +12,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04] + os: [ubuntu-22.04] steps: - uses: actions/checkout@v2 - name: Run Docker Test (${{ matrix.os }}) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index 7e7ef71973..aaf443afdc 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,25 +2,6 @@ set -x -e -o pipefail -./apt-install \ - apt-transport-https \ - ca-certificates \ - curl \ - software-properties-common - -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - - -add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable test" - -# checkpoint/restore is broken in Docker Engine (Community) version 25.0.0-beta.1 -# https://github.com/moby/moby/discussions/46816 -# Downgrade to the latest stable version. 
-VERSION_STRING=5:24.0.7-1~ubuntu.20.04~focal -./apt-install docker-ce=$VERSION_STRING docker-ce-cli=$VERSION_STRING containerd.io docker-buildx-plugin docker-compose-plugin - # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json service docker restart From 06c101654f3db771d1eb657944b128dfc3530cb0 Mon Sep 17 00:00:00 2001 From: Artem Trushkin Date: Sun, 24 Mar 2024 17:16:58 +0700 Subject: [PATCH 227/321] mem: fix some VMAs being incorrectly mapped with PROT_WRITE A memory interval is a half-open interval, so the condition when pr->pe->vaddr == vma->e->end should not be interpreted as an intersection and should cause vma to be marked with VMA_NO_PROT_WRITE. Fixes: #2364 Signed-off-by: Artem Trushkin --- criu/mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/mem.c b/criu/mem.c index 0236c5e1e9..5f0d57eb66 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1057,7 +1057,7 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo do { if (pr->pe->vaddr + pr->pe->nr_pages * PAGE_SIZE <= vma->e->start) continue; - if (pr->pe->vaddr > vma->e->end) + if (pr->pe->vaddr >= vma->e->end) vma->e->status |= VMA_NO_PROT_WRITE; break; } while (pr->advance(pr)); From 6ee6be5de76f6a3cd865ce0493ab55e9171e19f2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 4 Mar 2024 15:10:31 +0000 Subject: [PATCH 228/321] Add support for reset-on-fork scheduling flag This patch extends CRIU with support for SCHED_RESET_ON_FORK. When the SCHED_RESET_ON_FORK flag is set, the following rules apply for subsequently created children: - If the calling thread has a scheduling policy of SCHED_FIFO or SCHED_RR, the policy is reset to SCHED_OTHER in child processes. - If the calling process has a negative nice value, the nice value is reset to zero in child processes. 
(See 'man 7 sched') Fixes: #2359 Signed-off-by: Radostin Stoyanov --- criu/cr-dump.c | 5 +++++ criu/cr-restore.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index ee5974acc9..fe5e73798c 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -157,6 +157,11 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) tc->has_sched_policy = true; tc->sched_policy = ret; + /* The reset-on-fork flag might be used in combination + * with SCHED_FIFO or SCHED_RR to reset the scheduling + * policy/priority in child processes. + */ + ret &= ~SCHED_RESET_ON_FORK; if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) { ret = syscall(__NR_sched_getparam, pid, &sp); if (ret < 0) { diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 318d34c487..874986ca06 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3057,7 +3057,7 @@ static int validate_sched_parm(struct rst_sched_param *sp) if ((sp->nice < -20) || (sp->nice > 19)) return 0; - switch (sp->policy) { + switch (sp->policy & ~SCHED_RESET_ON_FORK) { case SCHED_RR: case SCHED_FIFO: return ((sp->prio > 0) && (sp->prio < 100)); From 2355a2a13d2b8b5b667a6d0a34a7561973339fdd Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 5 Mar 2024 08:53:56 +0000 Subject: [PATCH 229/321] zdtm/sched_policy00: use reset-on-fork flag This patch extends the sched_policy00 test case to verify that the SCHED_RESET_ON_FORK flag is restored correctly. 
Signed-off-by: Radostin Stoyanov --- test/zdtm/static/sched_policy00.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/zdtm/static/sched_policy00.c b/test/zdtm/static/sched_policy00.c index dc71eed940..a351350503 100644 --- a/test/zdtm/static/sched_policy00.c +++ b/test/zdtm/static/sched_policy00.c @@ -51,7 +51,7 @@ int main(int argc, char **argv) } p.sched_priority = param; - if (sched_setscheduler(pid, SCHED_RR, &p)) { + if (sched_setscheduler(pid, SCHED_RR | SCHED_RESET_ON_FORK, &p)) { pr_perror("Can't set policy"); kill(pid, SIGKILL); return -1; @@ -61,7 +61,7 @@ int main(int argc, char **argv) test_waitsig(); ret = sched_getscheduler(pid); - if (ret != SCHED_RR) { + if (ret != (SCHED_RR | SCHED_RESET_ON_FORK)) { fail("Broken/No policy"); err++; } From 61828769a2d253a836f116413912176fda29078b Mon Sep 17 00:00:00 2001 From: ccccrrrr Date: Thu, 29 Feb 2024 16:02:06 +0800 Subject: [PATCH 230/321] criu: move timers dump/restore code into separate file Fixes: #335 Signed-off-by: ccccrrrr --- criu/Makefile.crtools | 1 + criu/cr-dump.c | 1 + criu/cr-restore.c | 242 +------------------ criu/include/parasite-syscall.h | 5 - criu/include/timer.h | 17 ++ criu/parasite-syscall.c | 150 ------------ criu/timer.c | 399 ++++++++++++++++++++++++++++++++ 7 files changed, 419 insertions(+), 396 deletions(-) create mode 100644 criu/include/timer.h create mode 100644 criu/timer.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index f586449172..bf17f1ec9d 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -92,6 +92,7 @@ obj-y += servicefd.o obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o +obj-y += timer.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/cr-dump.c b/criu/cr-dump.c index fe5e73798c..a29ec82eff 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -86,6 +86,7 @@ #include "pidfd-store.h" #include "apparmor.h" 
#include "asm/dump.h" +#include "timer.h" /* * Architectures can overwrite this function to restore register sets that diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 874986ca06..c19a20b46f 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -98,6 +98,7 @@ #include "restore.h" #include "cr-errno.h" +#include "timer.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread @@ -118,7 +119,6 @@ static int restore_task_with_children(void *); static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); static int prepare_restorer_blob(void); static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); -static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); /* @@ -882,7 +882,6 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a return 0; } -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); static int prepare_mm(pid_t pid, struct task_restore_args *args); static int restore_one_alive_task(int pid, CoreEntry *core) @@ -2719,245 +2718,6 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he return -1; } -static inline int timeval_valid(struct timeval *tv) -{ - return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); -} - -static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) -{ - if (ie->isec == 0 && ie->iusec == 0) { - memzero_p(val); - return 0; - } - - val->it_interval.tv_sec = ie->isec; - val->it_interval.tv_usec = ie->iusec; - - if (!timeval_valid(&val->it_interval)) { - pr_err("Invalid timer interval\n"); - return -1; - } - - if (ie->vsec == 0 && ie->vusec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. 
- */ - val->it_value.tv_sec = ie->isec; - val->it_value.tv_usec = ie->iusec; - } else { - val->it_value.tv_sec = ie->vsec; - val->it_value.tv_usec = ie->vusec; - } - - if (!timeval_valid(&val->it_value)) { - pr_err("Invalid timer value\n"); - return -1; - } - - pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, - val->it_interval.tv_sec, val->it_interval.tv_usec); - - return 0; -} - -/* - * Legacy itimers restore from CR_FD_ITIMERS - */ - -static int prepare_itimers_from_fd(int pid, struct task_restore_args *args) -{ - int ret = -1; - struct cr_img *img; - ItimerEntry *ie; - - if (!deprecated_ok("Itimers")) - return -1; - - img = open_image(CR_FD_ITIMERS, O_RSTR, pid); - if (!img) - return -1; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("real", ie, &args->itimers[0]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("virt", ie, &args->itimers[1]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("prof", ie, &args->itimers[2]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; -out: - close_image(img); - return ret; -} - -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) -{ - int ret = 0; - TaskTimersEntry *tte = core->tc->timers; - - if (!tte) - return prepare_itimers_from_fd(pid, args); - - ret |= decode_itimer("real", tte->real, &args->itimers[0]); - ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); - ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); - - return ret; -} - -static inline int timespec_valid(struct timespec *ts) -{ - return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); -} - -static inline int decode_posix_timer(PosixTimerEntry *pte, struct 
restore_posix_timer *pt) -{ - pt->val.it_interval.tv_sec = pte->isec; - pt->val.it_interval.tv_nsec = pte->insec; - - if (!timespec_valid(&pt->val.it_interval)) { - pr_err("Invalid timer interval(posix)\n"); - return -1; - } - - if (pte->vsec == 0 && pte->vnsec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. - */ - pt->val.it_value.tv_sec = pte->isec; - pt->val.it_value.tv_nsec = pte->insec; - } else { - pt->val.it_value.tv_sec = pte->vsec; - pt->val.it_value.tv_nsec = pte->vnsec; - } - - if (!timespec_valid(&pt->val.it_value)) { - pr_err("Invalid timer value(posix)\n"); - return -1; - } - - pt->spt.it_id = pte->it_id; - pt->spt.clock_id = pte->clock_id; - pt->spt.si_signo = pte->si_signo; - pt->spt.it_sigev_notify = pte->it_sigev_notify; - pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); - pt->spt.notify_thread_id = pte->notify_thread_id; - pt->overrun = pte->overrun; - - return 0; -} - -static int cmp_posix_timer_proc_id(const void *p1, const void *p2) -{ - return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; -} - -static void sort_posix_timers(struct task_restore_args *ta) -{ - void *tmem; - - /* - * This is required for restorer's create_posix_timers(), - * it will probe them one-by-one for the desired ID, since - * kernel doesn't provide another API for timer creation - * with given ID. 
- */ - - if (ta->posix_timers_n > 0) { - tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); - qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); - } -} - -/* - * Legacy posix timers restoration from CR_FD_POSIX_TIMERS - */ - -static int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) -{ - struct cr_img *img; - int ret = -1; - struct restore_posix_timer *t; - - if (!deprecated_ok("Posix timers")) - return -1; - - img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); - if (!img) - return -1; - - ta->posix_timers_n = 0; - while (1) { - PosixTimerEntry *pte; - - ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); - if (ret <= 0) - break; - - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - break; - - ret = decode_posix_timer(pte, t); - if (ret < 0) - break; - - posix_timer_entry__free_unpacked(pte, NULL); - ta->posix_timers_n++; - } - - close_image(img); - if (!ret) - sort_posix_timers(ta); - - return ret; -} - -static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) -{ - int i, ret = -1; - TaskTimersEntry *tte = core->tc->timers; - struct restore_posix_timer *t; - - ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); - - if (!tte) - return prepare_posix_timers_from_fd(pid, ta); - - ta->posix_timers_n = tte->n_posix; - for (i = 0; i < ta->posix_timers_n; i++) { - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - goto out; - - if (decode_posix_timer(tte->posix[i], t)) - goto out; - } - - ret = 0; - sort_posix_timers(ta); -out: - return ret; -} - static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h index 4540e11ee7..70ecbb720f 100644 --- a/criu/include/parasite-syscall.h +++ b/criu/include/parasite-syscall.h @@ -22,11 +22,6 @@ struct parasite_ctl; struct 
parasite_thread_ctl; extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); -extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *); - -struct proc_posix_timers_stat; -extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *); extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); diff --git a/criu/include/timer.h b/criu/include/timer.h new file mode 100644 index 0000000000..09583a9019 --- /dev/null +++ b/criu/include/timer.h @@ -0,0 +1,17 @@ +#ifndef __CR_TIMER_H__ +#define __CR_TIMER_H__ + +#include "images/core.pb-c.h" + +struct task_restore_args; +struct pstree_item; +struct parasite_ctl; +struct proc_posix_timers_stat; + +extern int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); +extern int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); + +extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item); +extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item); +#endif \ No newline at end of file diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 295e404ec5..6d2aa9c887 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -278,156 +278,6 @@ int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *i return 0; } -static void encode_itimer(struct itimerval *v, ItimerEntry *ie) -{ - ie->isec = v->it_interval.tv_sec; - ie->iusec = v->it_interval.tv_usec; - ie->vsec = v->it_value.tv_sec; - ie->vusec = v->it_value.tv_usec; -} - -int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - struct parasite_dump_itimers_args 
*args; - int ret; - - args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); - if (ret < 0) - return ret; - - encode_itimer((&args->real), (core->tc->timers->real)); - encode_itimer((&args->virt), (core->tc->timers->virt)); - encode_itimer((&args->prof), (core->tc->timers->prof)); - - return 0; -} - -static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) -{ - int sz; - - /* - * Will be free()-ed in core_entry_free() - */ - - sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); - tte->posix = xmalloc(sz); - if (!tte->posix) - return -1; - - tte->n_posix = n; - *pte = (PosixTimerEntry *)(tte->posix + n); - return 0; -} - -static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) -{ - pid_t vtid = 0; - int i; - - if (rtid == 0) - return 0; - - if (!(root_ns_mask & CLONE_NEWPID)) { - /* Non-pid-namespace case */ - pte->notify_thread_id = rtid; - pte->has_notify_thread_id = true; - return 0; - } - - /* Pid-namespace case */ - if (!kdat.has_nspid) { - pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); - return -1; - } - - for (i = 0; i < item->nr_threads; i++) { - if (item->threads[i].real != rtid) - continue; - - vtid = item->threads[i].ns[0].virt; - break; - } - - if (vtid == 0) { - pr_err("Unable to convert the notify thread id %d\n", rtid); - return -1; - } - - pte->notify_thread_id = vtid; - pte->has_notify_thread_id = true; - return 0; -} - -static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, - PosixTimerEntry *pte) -{ - pte->it_id = vp->spt.it_id; - pte->clock_id = vp->spt.clock_id; - pte->si_signo = vp->spt.si_signo; - pte->it_sigev_notify = vp->spt.it_sigev_notify; - pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); - - pte->overrun = v->overrun; - - pte->isec = v->val.it_interval.tv_sec; - pte->insec = 
v->val.it_interval.tv_nsec; - pte->vsec = v->val.it_value.tv_sec; - pte->vnsec = v->val.it_value.tv_nsec; - - if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) - return -1; - - return 0; -} - -int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - TaskTimersEntry *tte = core->tc->timers; - PosixTimerEntry *pte; - struct proc_posix_timer *temp; - struct parasite_dump_posix_timers_args *args; - int ret, exit_code = -1; - int args_size; - int i; - - if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) - return -1; - - args_size = posix_timers_dump_size(proc_args->timer_n); - args = compel_parasite_args_s(ctl, args_size); - args->timer_n = proc_args->timer_n; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - args->timer[i].it_id = temp->spt.it_id; - i++; - } - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); - if (ret < 0) - goto end_posix; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - posix_timer_entry__init(&pte[i]); - if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) - goto end_posix; - tte->posix[i] = &pte[i]; - i++; - } - - exit_code = 0; -end_posix: - free_posix_timers(proc_args); - return exit_code; -} - int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc) { struct parasite_dump_misc *ma; diff --git a/criu/timer.c b/criu/timer.c new file mode 100644 index 0000000000..bdcb059cce --- /dev/null +++ b/criu/timer.c @@ -0,0 +1,399 @@ +#include "types.h" +#include "crtools.h" +#include "infect.h" +#include "protobuf.h" +#include "pstree.h" +#include "posix-timer.h" +#include "parasite.h" +#include "namespaces.h" +#include "rst-malloc.h" +#include "restorer.h" + +static inline int timeval_valid(struct timeval *tv) +{ + return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); +} + +static inline int 
decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) +{ + if (ie->isec == 0 && ie->iusec == 0) { + memzero_p(val); + return 0; + } + + val->it_interval.tv_sec = ie->isec; + val->it_interval.tv_usec = ie->iusec; + + if (!timeval_valid(&val->it_interval)) { + pr_err("Invalid timer interval\n"); + return -1; + } + + if (ie->vsec == 0 && ie->vusec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + val->it_value.tv_sec = ie->isec; + val->it_value.tv_usec = ie->iusec; + } else { + val->it_value.tv_sec = ie->vsec; + val->it_value.tv_usec = ie->vusec; + } + + if (!timeval_valid(&val->it_value)) { + pr_err("Invalid timer value\n"); + return -1; + } + + pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, + val->it_interval.tv_sec, val->it_interval.tv_usec); + + return 0; +} + +/* + * Legacy itimers restore from CR_FD_ITIMERS + */ + +int prepare_itimers_from_fd(int pid, struct task_restore_args *args) +{ + int ret = -1; + struct cr_img *img; + ItimerEntry *ie; + + if (!deprecated_ok("Itimers")) + return -1; + + img = open_image(CR_FD_ITIMERS, O_RSTR, pid); + if (!img) + return -1; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("real", ie, &args->itimers[0]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("virt", ie, &args->itimers[1]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("prof", ie, &args->itimers[2]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; +out: + close_image(img); + return ret; +} + +int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) +{ + int ret = 0; + TaskTimersEntry *tte = core->tc->timers; + + if (!tte) + return 
prepare_itimers_from_fd(pid, args); + + ret |= decode_itimer("real", tte->real, &args->itimers[0]); + ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); + ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); + + return ret; +} + +static inline int timespec_valid(struct timespec *ts) +{ + return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); +} + +static inline int decode_posix_timer(PosixTimerEntry *pte, struct restore_posix_timer *pt) +{ + pt->val.it_interval.tv_sec = pte->isec; + pt->val.it_interval.tv_nsec = pte->insec; + + if (!timespec_valid(&pt->val.it_interval)) { + pr_err("Invalid timer interval(posix)\n"); + return -1; + } + + if (pte->vsec == 0 && pte->vnsec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + pt->val.it_value.tv_sec = pte->isec; + pt->val.it_value.tv_nsec = pte->insec; + } else { + pt->val.it_value.tv_sec = pte->vsec; + pt->val.it_value.tv_nsec = pte->vnsec; + } + + if (!timespec_valid(&pt->val.it_value)) { + pr_err("Invalid timer value(posix)\n"); + return -1; + } + + pt->spt.it_id = pte->it_id; + pt->spt.clock_id = pte->clock_id; + pt->spt.si_signo = pte->si_signo; + pt->spt.it_sigev_notify = pte->it_sigev_notify; + pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); + pt->spt.notify_thread_id = pte->notify_thread_id; + pt->overrun = pte->overrun; + + return 0; +} + +static int cmp_posix_timer_proc_id(const void *p1, const void *p2) +{ + return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; +} + +static void sort_posix_timers(struct task_restore_args *ta) +{ + void *tmem; + + /* + * This is required for restorer's create_posix_timers(), + * it will probe them one-by-one for the desired ID, since + * kernel doesn't provide another API for timer creation + * with given ID. 
+ */ + + if (ta->posix_timers_n > 0) { + tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); + qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); + } +} + +/* + * Legacy posix timers restoration from CR_FD_POSIX_TIMERS + */ + +int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) +{ + struct cr_img *img; + int ret = -1; + struct restore_posix_timer *t; + + if (!deprecated_ok("Posix timers")) + return -1; + + img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); + if (!img) + return -1; + + ta->posix_timers_n = 0; + while (1) { + PosixTimerEntry *pte; + + ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); + if (ret <= 0) + break; + + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + break; + + ret = decode_posix_timer(pte, t); + if (ret < 0) + break; + + posix_timer_entry__free_unpacked(pte, NULL); + ta->posix_timers_n++; + } + + close_image(img); + if (!ret) + sort_posix_timers(ta); + + return ret; +} + +int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) +{ + int i, ret = -1; + TaskTimersEntry *tte = core->tc->timers; + struct restore_posix_timer *t; + + ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); + + if (!tte) + return prepare_posix_timers_from_fd(pid, ta); + + ta->posix_timers_n = tte->n_posix; + for (i = 0; i < ta->posix_timers_n; i++) { + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + goto out; + + if (decode_posix_timer(tte->posix[i], t)) + goto out; + } + + ret = 0; + sort_posix_timers(ta); +out: + return ret; +} + +static void encode_itimer(struct itimerval *v, ItimerEntry *ie) +{ + ie->isec = v->it_interval.tv_sec; + ie->iusec = v->it_interval.tv_usec; + ie->vsec = v->it_value.tv_sec; + ie->vusec = v->it_value.tv_usec; +} + +int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + struct 
parasite_dump_itimers_args *args; + int ret; + + args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); + if (ret < 0) + return ret; + + encode_itimer((&args->real), (core->tc->timers->real)); + encode_itimer((&args->virt), (core->tc->timers->virt)); + encode_itimer((&args->prof), (core->tc->timers->prof)); + + return 0; +} + +static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) +{ + int sz; + + /* + * Will be free()-ed in core_entry_free() + */ + + sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); + tte->posix = xmalloc(sz); + if (!tte->posix) + return -1; + + tte->n_posix = n; + *pte = (PosixTimerEntry *)(tte->posix + n); + return 0; +} + +static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) +{ + pid_t vtid = 0; + int i; + + if (rtid == 0) + return 0; + + if (!(root_ns_mask & CLONE_NEWPID)) { + /* Non-pid-namespace case */ + pte->notify_thread_id = rtid; + pte->has_notify_thread_id = true; + return 0; + } + + /* Pid-namespace case */ + if (!kdat.has_nspid) { + pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + if (item->threads[i].real != rtid) + continue; + + vtid = item->threads[i].ns[0].virt; + break; + } + + if (vtid == 0) { + pr_err("Unable to convert the notify thread id %d\n", rtid); + return -1; + } + + pte->notify_thread_id = vtid; + pte->has_notify_thread_id = true; + return 0; +} + +static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, + PosixTimerEntry *pte) +{ + pte->it_id = vp->spt.it_id; + pte->clock_id = vp->spt.clock_id; + pte->si_signo = vp->spt.si_signo; + pte->it_sigev_notify = vp->spt.it_sigev_notify; + pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); + + pte->overrun = v->overrun; + + pte->isec = v->val.it_interval.tv_sec; + 
pte->insec = v->val.it_interval.tv_nsec; + pte->vsec = v->val.it_value.tv_sec; + pte->vnsec = v->val.it_value.tv_nsec; + + if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) + return -1; + + return 0; +} + +int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + TaskTimersEntry *tte = core->tc->timers; + PosixTimerEntry *pte; + struct proc_posix_timer *temp; + struct parasite_dump_posix_timers_args *args; + int ret, exit_code = -1; + int args_size; + int i; + + if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) + return -1; + + args_size = posix_timers_dump_size(proc_args->timer_n); + args = compel_parasite_args_s(ctl, args_size); + args->timer_n = proc_args->timer_n; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + args->timer[i].it_id = temp->spt.it_id; + i++; + } + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); + if (ret < 0) + goto end_posix; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + posix_timer_entry__init(&pte[i]); + if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) + goto end_posix; + tte->posix[i] = &pte[i]; + i++; + } + + exit_code = 0; +end_posix: + free_posix_timers(proc_args); + return exit_code; +} From bec56d6a436770eb94bcd4e8b28c973005b14e78 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 24 Mar 2024 14:26:41 +0000 Subject: [PATCH 231/321] ci: silence CircleCI warning about deprecated image CircleCI currently prints out the following warning: This job is using a deprecated image 'ubuntu-2004:202010-01', please update to a newer image According to https://discuss.circleci.com/t/linux-image-deprecations-and-eol-for-2024/ the recommended image name is: "image: default" Signed-off-by: Adrian Reber --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml 
b/.circleci/config.yml index 47f7ad9b18..785b383e10 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2.1 jobs: test-local-gcc: machine: - image: ubuntu-2004:202010-01 + image: default working_directory: ~/criu steps: - checkout @@ -11,7 +11,7 @@ jobs: command: sudo -E make -C scripts/ci local test-local-clang: machine: - image: ubuntu-2004:202010-01 + image: default working_directory: ~/criu steps: - checkout From 00d7cdc452294c2e951dac04ff2aff2a5b528d96 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 27 Mar 2024 13:34:26 +0800 Subject: [PATCH 232/321] timer: fix wrapping allignment in function declaration Currently we have tabs + spaces on the wrapped line but the wrapped part is not aligned to the opening bracket. Fixes: bbe26d1b7 ("timer: fix allignment in function definition") Signed-off-by: Pavel Tikhomirov --- criu/include/timer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/include/timer.h b/criu/include/timer.h index 09583a9019..d1deb6051d 100644 --- a/criu/include/timer.h +++ b/criu/include/timer.h @@ -13,5 +13,5 @@ extern int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item); extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *item); -#endif \ No newline at end of file + struct pstree_item *item); +#endif From 9a282a5b9fb2e0e1eaa128cb1602a53623a87838 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 5 Apr 2024 14:19:28 -0700 Subject: [PATCH 233/321] Makefile.config: fix/improve feature warnings. 1. Tell which RPMs or DEBs are required in all cases. 2. Use $(info ...) everywhere. 3. Drop extra nested $(info), instead use (and document) a simpler kludge. 4. Simplify and unify the language, add missing periods.
Signed-off-by: Kir Kolyshkin --- Makefile.config | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/Makefile.config b/Makefile.config index 8f2b5208e0..52c250b21c 100644 --- a/Makefile.config +++ b/Makefile.config @@ -2,12 +2,15 @@ include $(__nmk_dir)utils.mk include $(__nmk_dir)msg.mk include scripts/feature-tests.mak +# This is a kludge for $(info ...) to not eat spaces. +S := + ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else $(info Note: Building without setproctitle() and strlcpy() support.) - $(info $(info) To enable these features, please install libbsd-devel (RPM) / libbsd-dev (DEB).) + $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) endif ifeq ($(call pkg-config-check,libselinux),y) @@ -23,10 +26,10 @@ endif ifeq ($(call pkg-config-check,libdrm),y) export CONFIG_AMDGPU := y - $(info Note: Building criu with amdgpu_plugin.) + $(info Note: Building with amdgpu_plugin.) else - $(info Note: Building criu without amdgpu_plugin.) - $(info Note: libdrm and libdrm_amdgpu are required to build amdgpu_plugin.) + $(info Note: Building without amdgpu_plugin.) + $(info $S Install libdrm-devel (RPM) or libdrm-dev (DEB) to fix.) endif ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) @@ -34,7 +37,8 @@ ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) export CONFIG_GNUTLS := y FEATURE_DEFINES += -DCONFIG_GNUTLS else - $(info Note: Building without GnuTLS support) + $(info Note: Building without GnuTLS support.) + $(info $S Install gnutls-devel (RPM) or gnutls-dev (DEB) to fix.) 
endif ifeq ($(call pkg-config-check,libnftables),y) @@ -46,12 +50,11 @@ ifeq ($(call pkg-config-check,libnftables),y) LIBS_FEATURES += $(LIB_NFTABLES) FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_1 else - $(warning Warn: you have libnftables installed but it has incompatible API) - $(warning Warn: Building without nftables support) + $(info Warn: Building without nftables support (incompatible API version).) endif else - $(warning Warn: you have no libnftables installed) - $(warning Warn: Building without nftables support) + $(info Warn: Building without nftables support.) + $(info $S Install nftables-devel (RPM) or libnftables-dev (DEB) to fix.) endif export LIBS += $(LIBS_FEATURES) @@ -67,10 +70,10 @@ ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) export CONFIG_COMPAT := y FEATURE_DEFINES += -DCONFIG_COMPAT else - $(info Note: Building without ia32 C/R, missed ia32 support in gcc) - $(info $(info) That may be related to missing gcc-multilib in your) - $(info $(info) distribution or you may have Debian with buggy toolchain) - $(info $(info) (issue https://github.com/checkpoint-restore/criu/issues/315)) + $(info Note: Building without ia32 C/R, missing ia32 support in gcc.) + $(info $S It may be related to missing gcc-multilib in your) + $(info $S distribution, or you may have Debian with buggy toolchain.) + $(info $S See https://github.com/checkpoint-restore/criu/issues/315.) endif endif From 7aa8ec44244615df5d557f24f8f041d5b4690ac2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 20 Dec 2023 15:22:48 -0800 Subject: [PATCH 234/321] check: verify ino and dev of overlayfs files in /proc/pid/maps Check that the file device and inode shown in /proc/pid/maps match values returned by stat(2). 
Signed-off-by: Andrei Vagin --- criu/cr-check.c | 194 +++++++++++++++++++++++++++++++++++++ scripts/ci/run-ci-tests.sh | 14 ++- 2 files changed, 204 insertions(+), 4 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index fea1ce674a..507f9915ca 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "../soccr/soccr.h" @@ -53,6 +54,8 @@ #include "restorer.h" #include "uffd.h" #include "linux/aio_abi.h" +#include "syscall.h" +#include "mount-v2.h" #include "images/inventory.pb-c.h" @@ -1390,6 +1393,195 @@ static int check_pagemap_scan(void) return 0; } +/* musl doesn't have a statx wrapper... */ +struct staty { + __u32 stx_dev_major; + __u32 stx_dev_minor; + __u64 stx_ino; +}; + +static long get_file_dev_and_inode(void *addr, struct staty *stx) +{ + char buf[4096]; + FILE *mapf; + + mapf = fopen("/proc/self/maps", "r"); + if (mapf == NULL) { + pr_perror("fopen(/proc/self/maps)"); + return -1; + } + + while (fgets(buf, sizeof(buf), mapf)) { + unsigned long start, end; + uint32_t maj, min; + __u64 ino; + + if (sscanf(buf, "%lx-%lx %*s %*s %x:%x %llu", + &start, &end, &maj, &min, &ino) != 5) { + pr_perror("Unable to parse: %s", buf); + return -1; + } + if (start == (unsigned long)addr) { + stx->stx_dev_major = maj; + stx->stx_dev_minor = min; + stx->stx_ino = ino; + return 0; + } + } + + pr_err("Unable to find the mapping\n"); + return -1; +} + +static int ovl_mount(void) +{ + int tmpfs, fsfd, ovl; + + fsfd = sys_fsopen("tmpfs", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen tmpfs"); + return -1; + } + + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create tmpfs mount"); + return -1; + } + + tmpfs = sys_fsmount(fsfd, 0, 0); + if (tmpfs == -1) { + pr_perror("Unable to mount tmpfs"); + return -1; + } + + close(fsfd); + + /* overlayfs can't be constructed on top of a detached mount. 
*/ + if (sys_move_mount(tmpfs, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)) { + pr_perror("Unable to attach tmpfs mount"); + return -1; + } + close(tmpfs); + + if (chdir("/tmp")) { + pr_perror("Unable to change working directory"); + return -1; + } + + if (mkdir("/tmp/w", 0755) == -1 || + mkdir("/tmp/u", 0755) == -1 || + mkdir("/tmp/l", 0755) == -1) { + pr_perror("mkdir"); + return -1; + } + + fsfd = sys_fsopen("overlay", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen overlayfs"); + return -1; + } + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { + pr_perror("Unable to configure overlayfs"); + return -1; + } + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create overlayfs"); + return -1; + } + ovl = sys_fsmount(fsfd, 0, 0); + if (ovl == -1) { + pr_perror("Unable to mount overlayfs"); + return -1; + } + + return ovl; +} + +/* + * Check that the file device and inode shown in /proc/pid/maps match values + * returned by stat(2). + */ +static int do_check_overlayfs_maps(void) +{ + struct staty stx, mstx; + struct stat st; + int ovl, fd; + void *addr; + + /* Create a new mount namespace to not care about cleaning test mounts. 
*/ + if (unshare(CLONE_NEWNS) == -1) { + pr_warn("Unable to create a new mount namespace\n"); + return 0; + } + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + pr_perror("Unable to remount / with MS_SLAVE"); + return -1; + } + + ovl = ovl_mount(); + if (ovl == -1) + return -1; + + fd = openat(ovl, "test", O_RDWR | O_CREAT, 0644); + if (fd == -1) { + pr_perror("Unable to open a test file"); + return -1; + } + + addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_perror("Unable to map the test file"); + return -1; + } + + if (get_file_dev_and_inode(addr, &mstx)) + return -1; + if (fstat(fd, &st)) { + pr_perror("stat"); + return -1; + } + stx.stx_dev_major = major(st.st_dev); + stx.stx_dev_minor = minor(st.st_dev); + stx.stx_ino = st.st_ino; + + if (stx.stx_dev_major != mstx.stx_dev_major || + stx.stx_dev_minor != mstx.stx_dev_minor || + stx.stx_ino != mstx.stx_ino) { + pr_err("unmatched dev:ino %x:%x:%llx (expected %x:%x:%llx)\n", + mstx.stx_dev_major, mstx.stx_dev_minor, mstx.stx_ino, + stx.stx_dev_major, stx.stx_dev_minor, stx.stx_ino); + return -1; + } + + return 0; +} + +static int check_overlayfs_maps(void) +{ + pid_t pid; + int status; + + pid = fork(); + if (pid == -1) { + pr_perror("Unable to fork a child"); + return -1; + } + if (pid == 0) { + if (do_check_overlayfs_maps()) + exit(1); + exit(0); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid"); + return -1; + } + return status == 0 ? 
0 : -1; +} + static int (*chk_feature)(void); /* @@ -1511,6 +1703,7 @@ int cr_check(void) ret |= check_ptrace_get_rseq_conf(); ret |= check_ipv6_freebind(); ret |= check_pagemap_scan(); + ret |= check_overlayfs_maps(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1633,6 +1826,7 @@ static struct feature_list feature_list[] = { { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, { "pagemap_scan", check_pagemap_scan }, + { "overlayfs_maps", check_overlayfs_maps }, { NULL, NULL }, }; diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 2fdecbc973..c50dc4174a 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -290,10 +290,16 @@ ip net add test # Check if cap_checkpoint_restore is supported and also if unshare -c is supported. # # Do not run this test in a container (see https://github.com/checkpoint-restore/criu/issues/2312). -# This is a temporary workaround until fixed in the kernel. -# The kernel currently does not show correct device and inode numbers in /proc/pid/maps -# for stackable file systems. -if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true && [ ! -e /run/.containerenv ]; then +# Before v6.8-rc1~215^2~6, the kernel currently did not show correct device and +# inode numbers in /proc/pid/maps for stackable file systems. 
+skip=0 +findmnt -no FSTYPE / | grep overlay && { + ./criu/criu check --feature overlayfs_maps || skip=1 +} +unshare -c /bin/true || skip=1 +capsh --supports=cap_checkpoint_restore || skip=1 + +if [ "$skip" == 0 ]; then make -C test/zdtm/ cleanout rm -rf test/dump setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu From 21c8f72dd3a268c672eacdc08c8a32f7738fdeab Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 26 Mar 2024 10:16:40 +0000 Subject: [PATCH 235/321] ci: update base OS to ubuntu 22.04 Signed-off-by: Radostin Stoyanov --- .github/workflows/alpine-test.yml | 2 +- .github/workflows/archlinux-test.yml | 2 +- .github/workflows/compat-test.yml | 2 +- .github/workflows/fedora-asan-test.yml | 2 +- .github/workflows/fedora-rawhide-test.yml | 2 +- .github/workflows/gcov-test.yml | 2 +- .github/workflows/java-test.yml | 2 +- .github/workflows/podman-test.yml | 2 +- .github/workflows/stream-test.yml | 2 +- .github/workflows/x86-64-clang-test.yml | 2 +- .github/workflows/x86-64-gcc-test.yml | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 06f466c519..5757fa82b8 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: target: [GCC=1, CLANG=1] diff --git a/.github/workflows/archlinux-test.yml b/.github/workflows/archlinux-test.yml index 328cc9d0f7..9e8b601369 100644 --- a/.github/workflows/archlinux-test.yml +++ b/.github/workflows/archlinux-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run Arch Linux Test diff --git a/.github/workflows/compat-test.yml b/.github/workflows/compat-test.yml index 79f8f00105..e8b5a897bb 100644 --- a/.github/workflows/compat-test.yml +++ b/.github/workflows/compat-test.yml @@ -9,7 +9,7 @@ 
concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: target: [GCC, CLANG] diff --git a/.github/workflows/fedora-asan-test.yml b/.github/workflows/fedora-asan-test.yml index 8b1bfcf323..11233f4575 100644 --- a/.github/workflows/fedora-asan-test.yml +++ b/.github/workflows/fedora-asan-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index 5355aa1926..fae544900c 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml index fcab478371..f221fabb5b 100644 --- a/.github/workflows/gcov-test.yml +++ b/.github/workflows/gcov-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/java-test.yml b/.github/workflows/java-test.yml index abed793bf3..af1f710466 100644 --- a/.github/workflows/java-test.yml +++ b/.github/workflows/java-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run Java Test diff --git a/.github/workflows/podman-test.yml b/.github/workflows/podman-test.yml index a7013a216f..077cf63e28 100644 --- a/.github/workflows/podman-test.yml +++ b/.github/workflows/podman-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run Podman Test diff --git a/.github/workflows/stream-test.yml b/.github/workflows/stream-test.yml index 0f5b307db9..efb217e16b 100644 --- a/.github/workflows/stream-test.yml +++ 
b/.github/workflows/stream-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/x86-64-clang-test.yml b/.github/workflows/x86-64-clang-test.yml index b3b50829a4..c9a1d3151b 100644 --- a/.github/workflows/x86-64-clang-test.yml +++ b/.github/workflows/x86-64-clang-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run X86_64 CLANG Test diff --git a/.github/workflows/x86-64-gcc-test.yml b/.github/workflows/x86-64-gcc-test.yml index ec70b61fb1..8d1815d577 100644 --- a/.github/workflows/x86-64-gcc-test.yml +++ b/.github/workflows/x86-64-gcc-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run X86_64 GCC Test From 654fed9f67812f53848ec5742fe5b9cbb576915d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 26 Mar 2024 10:17:44 +0000 Subject: [PATCH 236/321] ci: update actions/checkout to v4 Signed-off-by: Radostin Stoyanov --- .github/workflows/alpine-test.yml | 2 +- .github/workflows/archlinux-test.yml | 2 +- .github/workflows/compat-test.yml | 2 +- .github/workflows/cross-compile-daily.yml | 2 +- .github/workflows/cross-compile.yml | 2 +- .github/workflows/docker-test.yml | 2 +- .github/workflows/fedora-asan-test.yml | 2 +- .github/workflows/fedora-rawhide-test.yml | 2 +- .github/workflows/gcov-test.yml | 2 +- .github/workflows/java-test.yml | 2 +- .github/workflows/lint.yml | 2 +- .github/workflows/loongarch64-qemu-test.yml | 2 +- .github/workflows/podman-test.yml | 2 +- .github/workflows/stream-test.yml | 2 +- .github/workflows/x86-64-clang-test.yml | 2 +- .github/workflows/x86-64-gcc-test.yml | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 5757fa82b8..73530d79ae 100644 
--- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -15,6 +15,6 @@ jobs: target: [GCC=1, CLANG=1] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Alpine ${{ matrix.target }} Test run: sudo -E make -C scripts/ci alpine ${{ matrix.target }} diff --git a/.github/workflows/archlinux-test.yml b/.github/workflows/archlinux-test.yml index 9e8b601369..425f0662be 100644 --- a/.github/workflows/archlinux-test.yml +++ b/.github/workflows/archlinux-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Arch Linux Test run: sudo -E make -C scripts/ci archlinux diff --git a/.github/workflows/compat-test.yml b/.github/workflows/compat-test.yml index e8b5a897bb..8a64ce1857 100644 --- a/.github/workflows/compat-test.yml +++ b/.github/workflows/compat-test.yml @@ -16,6 +16,6 @@ jobs: steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Compat Tests (${{ matrix.target }}) run: sudo -E make -C scripts/ci local COMPAT_TEST=y ${{ matrix.target }}=1 diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml index 927ddced26..b8c8c86d48 100644 --- a/.github/workflows/cross-compile-daily.yml +++ b/.github/workflows/cross-compile-daily.yml @@ -14,7 +14,7 @@ jobs: branches: [criu-dev, master] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: ref: ${{ matrix.branches }} - name: Run Cross Compilation Targets diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index 4da5d397c6..06b8128231 100644 --- a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -33,7 +33,7 @@ jobs: target: mips64el-unstable-cross steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Cross Compilation Targets run: > sudo make -C scripts/ci ${{ matrix.target }} diff --git 
a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml index 11d67432ba..23696905a3 100644 --- a/.github/workflows/docker-test.yml +++ b/.github/workflows/docker-test.yml @@ -14,6 +14,6 @@ jobs: matrix: os: [ubuntu-22.04] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Docker Test (${{ matrix.os }}) run: sudo make -C scripts/ci docker-test diff --git a/.github/workflows/fedora-asan-test.yml b/.github/workflows/fedora-asan-test.yml index 11233f4575..02dc9a1b3f 100644 --- a/.github/workflows/fedora-asan-test.yml +++ b/.github/workflows/fedora-asan-test.yml @@ -12,6 +12,6 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Fedora ASAN Test run: sudo -E make -C scripts/ci fedora-asan diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index fae544900c..83e2ead825 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Fedora Rawhide Test # We need to pass environment variables from the CI environment to # distinguish between CI environments. 
However, we need to make sure that diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml index f221fabb5b..cc4e1d44ac 100644 --- a/.github/workflows/gcov-test.yml +++ b/.github/workflows/gcov-test.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Coverage Tests run: sudo -E make -C scripts/ci local GCOV=1 - name: Run gcov diff --git a/.github/workflows/java-test.yml b/.github/workflows/java-test.yml index af1f710466..cbd3c1f23f 100644 --- a/.github/workflows/java-test.yml +++ b/.github/workflows/java-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Java Test run: sudo make -C scripts/ci java-test diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 4892594744..862d682458 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -16,7 +16,7 @@ jobs: - name: Install tools run: sudo dnf -y install git make ruff xz clang-tools-extra which codespell git-clang-format ShellCheck - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set git safe directory # https://github.com/actions/checkout/issues/760 diff --git a/.github/workflows/loongarch64-qemu-test.yml b/.github/workflows/loongarch64-qemu-test.yml index ba22fa25ff..d7c554c872 100644 --- a/.github/workflows/loongarch64-qemu-test.yml +++ b/.github/workflows/loongarch64-qemu-test.yml @@ -11,5 +11,5 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - run: sudo make -C scripts/ci loongarch64-qemu-test diff --git a/.github/workflows/podman-test.yml b/.github/workflows/podman-test.yml index 077cf63e28..a07edbe5b2 100644 --- a/.github/workflows/podman-test.yml +++ b/.github/workflows/podman-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: 
Run Podman Test run: sudo make -C scripts/ci podman-test diff --git a/.github/workflows/stream-test.yml b/.github/workflows/stream-test.yml index efb217e16b..76bd96edf7 100644 --- a/.github/workflows/stream-test.yml +++ b/.github/workflows/stream-test.yml @@ -12,6 +12,6 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run CRIU Image Streamer Test run: sudo -E make -C scripts/ci local STREAM_TEST=1 diff --git a/.github/workflows/x86-64-clang-test.yml b/.github/workflows/x86-64-clang-test.yml index c9a1d3151b..1f0a469bd5 100644 --- a/.github/workflows/x86-64-clang-test.yml +++ b/.github/workflows/x86-64-clang-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run X86_64 CLANG Test run: sudo make -C scripts/ci x86_64 CLANG=1 diff --git a/.github/workflows/x86-64-gcc-test.yml b/.github/workflows/x86-64-gcc-test.yml index 8d1815d577..15e84a0dfc 100644 --- a/.github/workflows/x86-64-gcc-test.yml +++ b/.github/workflows/x86-64-gcc-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run X86_64 GCC Test run: sudo make -C scripts/ci x86_64 From 1a4c10388c40e8467453c2e24251c2ff44d500af Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 26 Mar 2024 13:01:16 +0000 Subject: [PATCH 237/321] ci/vdso01: fix typo Signed-off-by: Radostin Stoyanov --- test/zdtm/static/vdso01.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/vdso01.c b/test/zdtm/static/vdso01.c index d8d64155ad..4e33d30a8f 100644 --- a/test/zdtm/static/vdso01.c +++ b/test/zdtm/static/vdso01.c @@ -372,7 +372,7 @@ static int vdso_time_handler(void *func) t1 = time(NULL); t2 = vdso_time(NULL); - test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t1); + test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t2); if (labs(t1 - t2) > TIME_DELTA_SEC) { 
pr_perror("Delta is too big"); From 18dcf15b3d7f8b92f550ef3ffa56cc285df4d4b6 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 16 Apr 2024 12:40:20 +0100 Subject: [PATCH 238/321] mem: optimize debug logging of enqueued pages During restore, CRIU prints "Enqueue page-read" messages for each page-read request [1]. However, this message does not provide useful information, increases performance overhead during restore and the size of log file. $ ./zdtm.py run -t zdtm/static/maps06 -f h -k always $ grep 'Enqueue page-read' dump/zdtm/static/maps06/56/1/restore.log | wc -l 20493 This commit replaces these log messages with a single message that shows the number of enqueued page-read requests. $ grep 'enqueued' dump/zdtm/static/maps06/56/1/restore.log (00.061449) 56: nr_enqueued: 20493 [1] https://github.com/checkpoint-restore/criu/commit/91388fc Signed-off-by: Radostin Stoyanov --- criu/mem.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/criu/mem.c b/criu/mem.c index 5f0d57eb66..c9578ef441 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1087,6 +1087,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) unsigned int nr_shared = 0; unsigned int nr_dropped = 0; unsigned int nr_compared = 0; + unsigned int nr_enqueued = 0; unsigned int nr_lazy = 0; unsigned long va; @@ -1162,7 +1163,8 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) len >>= PAGE_SHIFT; nr_restored += len; i += len - 1; - pr_debug("Enqueue page-read\n"); + + nr_enqueued++; continue; } @@ -1258,7 +1260,8 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) pr_info("nr_restored_pages: %d\n", nr_restored); pr_info("nr_shared_pages: %d\n", nr_shared); - pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_enqueued: %d\n", nr_enqueued); pr_info("nr_lazy: %d\n", nr_lazy); return 0; From c716c4df2aeb212de93c091b05205b7796719dae 
Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 16 Apr 2024 14:37:01 +0800 Subject: [PATCH 239/321] sk-tcp: cleanup dump_tcp_conn_state error handling 1) In dump_tcp_conn_state, if the return value from libsoccr_save is >=0, we check that sizeof(struct libsoccr_sk_data) returned from libsoccr_save is equal to sizeof(struct libsoccr_sk_data) we see in dump_tcp_conn_state (probably to check if we use the right library version). And if sizes are different we go to err_r, which just returns ret, which can theoretically be 0 (if size in library is zero) and that would lead dump_one_tcp to treat this as success though it is an obvious error. 2) In case dump_opt or open_image fails we don't explicitly set ret and rely on the sizeof(struct libsoccr_sk_data) previously set to ret not being 0, I don't really like it, it makes reading code too complex. 3) We have a lot of err_* labels which do exactly the same thing, there is no point in having all of them, also it is better to choose the name of the label based on what it really does. So let's refactor error handling to avoid these inconsistencies.
Signed-off-by: Pavel Tikhomirov --- criu/sk-tcp.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 630a182a27..b8d9ba46e1 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -135,6 +135,7 @@ void cpt_unlock_tcp_connections(void) static int dump_tcp_conn_state(struct inet_sk_desc *sk) { struct libsoccr_sk *socr = sk->priv; + int exit_code = -1; int ret, aux; struct cr_img *img; TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT; @@ -144,11 +145,11 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) ret = libsoccr_save(socr, &data, sizeof(data)); if (ret < 0) { pr_err("libsoccr_save() failed with %d\n", ret); - goto err_r; + goto err; } if (ret != sizeof(data)) { pr_err("This libsocr is not supported (%d vs %d)\n", ret, (int)sizeof(data)); - goto err_r; + goto err; } sk->state = data.state; @@ -190,7 +191,7 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) */ if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux)) - goto err_opt; + goto err; if (aux) { tse.has_nodelay = true; @@ -198,7 +199,7 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) } if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux)) - goto err_opt; + goto err; if (aux) { tse.has_cork = true; @@ -208,20 +209,19 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) /* * Push the stuff to image */ - img = open_image(CR_FD_TCP_STREAM, O_DUMP, sk->sd.ino); if (!img) - goto err_img; + goto err; ret = pb_write_one(img, &tse, PB_TCP_STREAM); if (ret < 0) - goto err_iw; + goto err_close; buf = libsoccr_get_queue_bytes(socr, TCP_RECV_QUEUE, SOCCR_MEM_EXCL); if (buf) { ret = write_img_buf(img, buf, tse.inq_len); if (ret < 0) - goto err_iw; + goto err_close; xfree(buf); } @@ -230,18 +230,17 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) if (buf) { ret = write_img_buf(img, buf, tse.outq_len); if (ret < 0) - goto err_iw; + goto err_close; xfree(buf); } pr_info("Done\n"); -err_iw: + exit_code = 0; +err_close: 
close_image(img); -err_img: -err_opt: -err_r: - return ret; +err: + return exit_code; } int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) From 37fbcc5ed6de97a0b917d65dab9f5da64e197591 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 9 May 2024 16:16:50 -0700 Subject: [PATCH 240/321] criu: fix a fatal failure if nft doesn't work On some systems, nft binary might not be installed, or some kernel options might be unconfigured, resulting in something like this: sudo unshare -n nft create table inet CRIU Error: Could not process rule: Operation not supported create table inet CRIU ^^^^^^^^^^^^^^^^^^^^^^^ This is similar to what kerndat_has_nftables_concat() does, and if the outcome is the same, it returns an error to kerndat_init(), and an error from kerndat_init() is considered fatal. Let's relax the check, returning mere "feature not working" instead of a fatal error. Signed-off-by: Kir Kolyshkin --- criu/kerndat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 6f4fea46b8..f899ef642c 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1602,7 +1602,9 @@ static int __has_nftables_concat(void *arg) return 1; if (NFT_RUN_CMD(nft, "create table inet CRIU")) { - pr_err("Can't create nftables table\n"); + pr_warn("Can't create nftables table\n"); + *has = false; /* kdat.has_nftables_concat = false */ + ret = 0; goto nft_ctx_free_out; } From de145790cebcb7c51d097425596ec9230a13dac6 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Sun, 19 May 2024 12:49:18 +0100 Subject: [PATCH 241/321] sk-tcp: Move TCP socket options from TcpStreamEntry to TcpOptsEntry Currently some of the TCP socket option information is stored in the TcpStreamEntry, but the information in the TcpStreamEntry is only restored after the TCP socket has established connection, which results in these TCP socket options not being restored for unconnected TCP sockets. 
In this commit move the TCP socket options from TcpStreamEntry to TcpOptsEntry and add dump_tcp_opts() and restore_tcp_opts() for TCP socket options dump and restore. Signed-off-by: Juntong Deng --- criu/include/sk-inet.h | 3 +++ criu/sk-inet.c | 18 +++++++++++++- criu/sk-tcp.c | 55 +++++++++++++++++++++++++---------------- images/sk-inet.proto | 2 ++ images/tcp-stream.proto | 6 +++++ 5 files changed, 62 insertions(+), 22 deletions(-) diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index b3a70fb27e..69ee8589e6 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -87,6 +87,9 @@ extern void cpt_unlock_tcp_connections(void); extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); extern int restore_one_tcp(int sk, struct inet_sk_info *si); +extern int dump_tcp_opts(int sk, TcpOptsEntry *toe); +extern int restore_tcp_opts(int sk, TcpOptsEntry *toe); + #define SK_EST_PARAM "tcp-established" #define SK_INFLIGHT_PARAM "skip-in-flight" #define SK_CLOSE_PARAM "tcp-close" diff --git a/criu/sk-inet.c b/criu/sk-inet.c index a6a767c73f..92f53e5697 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -454,6 +454,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT; IpOptsRawEntry ipopts_raw = IP_OPTS_RAW_ENTRY__INIT; SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; + TcpOptsEntry tcpopts = TCP_OPTS_ENTRY__INIT; int ret = -1, err = -1, proto, aux, type; ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL, &proto, sizeof(proto)); @@ -521,6 +522,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa ie.opts = &skopts; ie.ip_opts = &ipopts; ie.ip_opts->raw = &ipopts_raw; + ie.tcp_opts = &tcpopts; ie.n_src_addr = PB_ALEN_INET; ie.n_dst_addr = PB_ALEN_INET; @@ -581,9 +583,20 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa switch (proto) { case IPPROTO_TCP: - err = (type != SOCK_RAW) ? 
dump_one_tcp(lfd, sk, &skopts) : 0; if (sk->shutdown) sk_encode_shutdown(&ie, sk->shutdown); + + if (type == SOCK_RAW) { + err = 0; + } else { + err = dump_tcp_opts(lfd, &tcpopts); + if (err < 0) + goto err; + + err = dump_one_tcp(lfd, sk, &skopts); + if (err < 0) + goto err; + } break; case IPPROTO_UDP: case IPPROTO_UDPLITE: @@ -939,6 +952,9 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) if (restore_socket_opts(sk, ie->opts)) goto err; + if (ie->proto == IPPROTO_TCP && restore_tcp_opts(sk, ie->tcp_opts)) + goto err; + if (ie->has_shutdown && (ie->proto == IPPROTO_UDP || ie->proto == IPPROTO_UDPLITE || ie->proto == IPPROTO_TCP)) { if (shutdown(sk, sk_decode_shutdown(ie->shutdown))) { diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index b8d9ba46e1..f80a4cb9c0 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -136,7 +136,7 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) { struct libsoccr_sk *socr = sk->priv; int exit_code = -1; - int ret, aux; + int ret; struct cr_img *img; TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT; char *buf; @@ -186,26 +186,6 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) tse.rcv_wup = data.rcv_wup; } - /* - * TCP socket options - */ - - if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux)) - goto err; - - if (aux) { - tse.has_nodelay = true; - tse.nodelay = true; - } - - if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux)) - goto err; - - if (aux) { - tse.has_cork = true; - tse.cork = true; - } - /* * Push the stuff to image */ @@ -243,6 +223,19 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) return exit_code; } +int dump_tcp_opts(int fd, TcpOptsEntry *toe) +{ + int ret = 0; + + ret |= dump_opt(fd, SOL_TCP, TCP_NODELAY, &toe->nodelay); + ret |= dump_opt(fd, SOL_TCP, TCP_CORK, &toe->cork); + + toe->has_nodelay = !!toe->nodelay; + toe->has_cork = !!toe->cork; + + return ret; +} + int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) { soe->has_tcp_keepcnt = true; @@ -396,6 +389,11 @@ static int 
restore_tcp_conn_state(int sk, struct libsoccr_sk *socr, struct inet_ if (libsoccr_restore(socr, &data, sizeof(data))) goto err_c; + /* + * Restoring TCP socket options in TcpStreamEntry is + * for backward compatibility only, newer versions + * of CRIU use TcpOptsEntry. + */ if (tse->has_nodelay && tse->nodelay) { aux = 1; if (restore_opt(sk, SOL_TCP, TCP_NODELAY, &aux)) @@ -448,6 +446,21 @@ int prepare_tcp_socks(struct task_restore_args *ta) return 0; } +int restore_tcp_opts(int sk, TcpOptsEntry *toe) +{ + int ret = 0; + + if(!toe) + return ret; + + if (toe->has_nodelay) + ret |= restore_opt(sk, SOL_TCP, TCP_NODELAY, &toe->nodelay); + if (toe->has_cork) + ret |= restore_opt(sk, SOL_TCP, TCP_CORK, &toe->cork); + + return ret; +} + int restore_one_tcp(int fd, struct inet_sk_info *ii) { struct libsoccr_sk *sk; diff --git a/images/sk-inet.proto b/images/sk-inet.proto index 03a679e7fa..2c709e0181 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -5,6 +5,7 @@ syntax = "proto2"; import "opts.proto"; import "fown.proto"; import "sk-opts.proto"; +import "tcp-stream.proto"; message ip_opts_raw_entry { optional bool hdrincl = 1; @@ -56,4 +57,5 @@ message inet_sk_entry { optional string ifname = 17; optional uint32 ns_id = 18; optional sk_shutdown shutdown = 19; + optional tcp_opts_entry tcp_opts = 20; } diff --git a/images/tcp-stream.proto b/images/tcp-stream.proto index c2244ba3bf..4f85282e29 100644 --- a/images/tcp-stream.proto +++ b/images/tcp-stream.proto @@ -4,6 +4,11 @@ syntax = "proto2"; import "opts.proto"; +message tcp_opts_entry { + optional bool cork = 1; + optional bool nodelay = 2; +} + message tcp_stream_entry { required uint32 inq_len = 1; required uint32 inq_seq = 2; @@ -16,6 +21,7 @@ message tcp_stream_entry { optional uint32 rcv_wscale = 8; optional uint32 timestamp = 9; + /* These two are deprecated, use tcp_opts_entry instead */ optional bool cork = 10; optional bool nodelay = 11; From 277878b346b2b4e51aa8861388a3ae8c8e2905e3 Mon Sep 17 
00:00:00 2001 From: Juntong Deng Date: Sun, 19 May 2024 12:55:02 +0100 Subject: [PATCH 242/321] sk-tcp: Move TCP socket options from SkOptsEntry to TcpOptsEntry Currently some TCP socket option information is stored in SkOptsEntry, which is a little confusing. SkOptsEntry should only contain socket options that are common to all sockets. In this commit move the TCP-specific socket options from SkOptsEntry to TcpOptsEntry. Signed-off-by: Juntong Deng --- criu/sk-tcp.c | 30 ++++++++++++------------------ criu/sockets.c | 6 ++++++ images/sk-opts.proto | 3 +++ images/tcp-stream.proto | 3 +++ 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index f80a4cb9c0..9c8bad1c3f 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -229,33 +229,21 @@ int dump_tcp_opts(int fd, TcpOptsEntry *toe) ret |= dump_opt(fd, SOL_TCP, TCP_NODELAY, &toe->nodelay); ret |= dump_opt(fd, SOL_TCP, TCP_CORK, &toe->cork); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); toe->has_nodelay = !!toe->nodelay; toe->has_cork = !!toe->cork; + toe->has_keepcnt = !!toe->keepcnt; + toe->has_keepidle = !!toe->keepidle; + toe->has_keepintvl = !!toe->keepintvl; return ret; } int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) { - soe->has_tcp_keepcnt = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt)) { - pr_perror("Can't read TCP_KEEPCNT"); - return -1; - } - - soe->has_tcp_keepidle = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &soe->tcp_keepidle)) { - pr_perror("Can't read TCP_KEEPIDLE"); - return -1; - } - - soe->has_tcp_keepintvl = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &soe->tcp_keepintvl)) { - pr_perror("Can't read TCP_KEEPINTVL"); - return -1; - } - if (sk->dst_port == 0) return 0; @@ -457,6 +445,12 @@ int restore_tcp_opts(int sk, TcpOptsEntry *toe) ret |= restore_opt(sk, SOL_TCP, 
TCP_NODELAY, &toe->nodelay); if (toe->has_cork) ret |= restore_opt(sk, SOL_TCP, TCP_CORK, &toe->cork); + if (toe->has_keepcnt) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); + if (toe->has_keepidle) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); + if (toe->has_keepintvl) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); return ret; } diff --git a/criu/sockets.c b/criu/sockets.c index 560c765175..f9ce999bed 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -585,6 +585,12 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_debug("\tset keepalive for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_KEEPALIVE, &val); } + + /* + * Restoring TCP socket options in SkOptsEntry is + * for backward compatibility only, newer versions + * of CRIU use TcpOptsEntry. + */ if (soe->has_tcp_keepcnt) { pr_debug("\tset keepcnt for socket\n"); ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt); diff --git a/images/sk-opts.proto b/images/sk-opts.proto index 1d24d47cc7..2f9d4e5c3c 100644 --- a/images/sk-opts.proto +++ b/images/sk-opts.proto @@ -26,9 +26,12 @@ message sk_opts_entry { optional bool so_reuseport = 17; optional bool so_broadcast = 18; optional bool so_keepalive = 19; + + /* These three are deprecated, use tcp_opts_entry instead */ optional uint32 tcp_keepcnt = 20; optional uint32 tcp_keepidle = 21; optional uint32 tcp_keepintvl = 22; + optional uint32 so_oobinline = 23; optional uint32 so_linger = 24; diff --git a/images/tcp-stream.proto b/images/tcp-stream.proto index 4f85282e29..3d834159fb 100644 --- a/images/tcp-stream.proto +++ b/images/tcp-stream.proto @@ -7,6 +7,9 @@ import "opts.proto"; message tcp_opts_entry { optional bool cork = 1; optional bool nodelay = 2; + optional uint32 keepcnt = 3; + optional uint32 keepidle = 4; + optional uint32 keepintvl = 5; } message tcp_stream_entry { From 516b3698a7613b9d2d02b1d62a32fd1f015a32e6 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Sun, 19 May 2024 
12:57:17 +0100 Subject: [PATCH 243/321] sk-tcp: Add test cases for TCP_CORK and TCP_NODELAY socket options Currently there are no socket option test cases for TCP_CORK and TCP_NODELAY, this commit adds related test cases. The socket option test cases for TCP_KEEPCNT, TCP_KEEPIDLE, and TCP_KEEPINTVL already exist in socket-tcp_keepalive.c, so they are not included in this test case. Signed-off-by: Juntong Deng --- test/zdtm/static/Makefile | 3 + test/zdtm/static/sock_tcp_opts00.c | 96 +++++++++++++++++++++++++++ test/zdtm/static/sock_tcp_opts00.desc | 1 + test/zdtm/static/sock_tcp_opts01.c | 1 + test/zdtm/static/sock_tcp_opts01.desc | 1 + 5 files changed, 102 insertions(+) create mode 100644 test/zdtm/static/sock_tcp_opts00.c create mode 100644 test/zdtm/static/sock_tcp_opts00.desc create mode 120000 test/zdtm/static/sock_tcp_opts01.c create mode 120000 test/zdtm/static/sock_tcp_opts01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 548cefac28..1e891f0ba4 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -127,6 +127,8 @@ TST_NOFILE := \ sock_opts02 \ sock_ip_opts00 \ sock_ip_opts01 \ + sock_tcp_opts00 \ + sock_tcp_opts01 \ sk-unix-unconn \ sk-unix-unconn-seqpacket \ ipc_namespace \ @@ -609,6 +611,7 @@ socket-tcp6-closed: CFLAGS += -D ZDTM_IPV4V6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp-skip-in-flight: CFLAGS += -D ZDTM_IPV4V6 sock_ip_opts01: CFLAGS += -DZDTM_VAL_ZERO +sock_tcp_opts01: CFLAGS += -DZDTM_VAL_ZERO tun_ns: CFLAGS += -DTUN_NS mnt_ext_manual: CFLAGS += -D ZDTM_EXTMAP_MANUAL mntns_pivot_root_ro: CFLAGS += -DMNTNS_PIVOT_ROOT_RO diff --git a/test/zdtm/static/sock_tcp_opts00.c b/test/zdtm/static/sock_tcp_opts00.c new file mode 100644 index 0000000000..8061bc9ea1 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts00.c @@ -0,0 +1,96 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that different tcp socket options are restored"; 
+const char *test_author = "Juntong Deng "; + +#ifdef ZDTM_VAL_ZERO +#define TCP_OPT_VAL 0 +#else +#define TCP_OPT_VAL 1 +#endif + +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + +struct sk_opt { + int level; + int opt; + int val; +}; + +struct sk_opt tcp_sk_opts[] = { + { SOL_TCP, TCP_CORK, TCP_OPT_VAL }, + { SOL_TCP, TCP_NODELAY, TCP_OPT_VAL }, +}; + +struct sk_conf { + int domain; + int type; + int protocol; + int sk; +} sk_confs[] = { + { AF_INET, SOCK_STREAM, IPPROTO_TCP }, + { AF_INET6, SOCK_STREAM, IPPROTO_TCP }, +}; + +int main(int argc, char **argv) +{ + struct sk_opt *opts = tcp_sk_opts; + int n_opts = ARRAY_SIZE(tcp_sk_opts); + int exit_code = 1; + int i, j, val; + socklen_t len; + + test_init(argc, argv); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + sk_confs[i].sk = socket(sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol); + if (sk_confs[i].sk == -1) { + pr_perror("socket(%d,%d,%d) failed", sk_confs[i].domain, sk_confs[i].type, + sk_confs[i].protocol); + goto close; + } + } + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + for (j = 0; j < n_opts; j++) { + val = opts[j].val; + if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { + pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + for (j = 0; j < n_opts; j++) { + len = sizeof(int); + if (getsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, &len) == -1) { + pr_perror("getsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + + if (val != opts[j].val) { + fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, + sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); + goto close; + } + } + } + + pass(); + exit_code = 0; +close: + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) + close(sk_confs[i].sk); + return exit_code; +} diff --git a/test/zdtm/static/sock_tcp_opts00.desc 
b/test/zdtm/static/sock_tcp_opts00.desc new file mode 100644 index 0000000000..2eac7e654b --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts00.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/test/zdtm/static/sock_tcp_opts01.c b/test/zdtm/static/sock_tcp_opts01.c new file mode 120000 index 0000000000..5219c2e989 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts01.c @@ -0,0 +1 @@ +./sock_tcp_opts00.c \ No newline at end of file diff --git a/test/zdtm/static/sock_tcp_opts01.desc b/test/zdtm/static/sock_tcp_opts01.desc new file mode 120000 index 0000000000..fb1dfdcd13 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts01.desc @@ -0,0 +1 @@ +./sock_tcp_opts00.desc \ No newline at end of file From 0f3246a7c3dba9904219437a4b8c5ad7fc8a04e7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 23 May 2024 17:17:03 +0100 Subject: [PATCH 244/321] mount: fix unbounded write Replace sprintf() with snprintf() and specify maximum length of characters to avoid potential overflow. Reported-by: GitHub CodeQL (https://codeql.github.com/) Signed-off-by: Radostin Stoyanov --- criu/mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/mount.c b/criu/mount.c index afbd242810..82bbd52d6c 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -98,7 +98,7 @@ static char *ext_mount_lookup(char *key) int len = strlen(key); char mkey[len + 6]; - sprintf(mkey, "mnt[%s]", key); + snprintf(mkey, sizeof(mkey), "mnt[%s]", key); v = external_lookup_by_key(mkey); if (IS_ERR(v)) v = NULL; From f6d635c2f510cb8549ec669a16140addc7174322 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 May 2024 19:13:01 +0100 Subject: [PATCH 245/321] test/make: remove unused target A fault-injection test was introduced in commit [1] and later removed in commit [2]. This patch removes the obsolete Makefile target. 
[1] b95407e264fcf58f4f73f78abef6dac60436e7dd test: check, that parasite can rollback itself (v2) [2] 2cb4532e266d0c9f8e87839d5b5eb728a3e4d10d tests: remove zdtm.sh (v2) Signed-off-by: Radostin Stoyanov --- test/Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/Makefile b/test/Makefile index 5784b6a495..0bfdab6802 100644 --- a/test/Makefile +++ b/test/Makefile @@ -45,10 +45,6 @@ zdtm-freezer: ./zdtm.py run --test zdtm/transition/thread-bomb --pre 3 --freezecg zdtm:f .PHONY: zdtm-freezer -fault-injection: - $(MAKE) -C fault-injection -.PHONY: fault-injection - override CFLAGS += -D_GNU_SOURCE clean_root: From 86312285f53d66422fe542c81e658ede8c463603 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 21 May 2024 09:25:30 +0100 Subject: [PATCH 246/321] ci: update check for SELinux The rawhide tests runs in a container. Containers always have SELinux disabled from the inside. Somehow /sys/fs/selinux is now mounted. We used the existence of that directory if SELinux is available. This seems to be no longer true. Signed-off-by: Adrian Reber Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index c50dc4174a..8ee734fbc0 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -306,14 +306,19 @@ if [ "$skip" == 0 ]; then if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. selinuxmode=$(getenforce) - setenforce Permissive + if [ "$selinuxmode" != "Disabled" ]; then + setenforce Permissive + fi + fi # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore # under those conditions. 
Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then - setenforce "$selinuxmode" + if [ "$selinuxmode" != "Disabled" ]; then + setenforce "$selinuxmode" + fi fi setcap -r criu/criu else From 1a848fea5636263573ef32ddb1829a226c80bb37 Mon Sep 17 00:00:00 2001 From: Arnav Bhatt Date: Sun, 10 Mar 2024 13:13:12 +0530 Subject: [PATCH 247/321] criu: move sigact dump/restore code into sigact.c Seperate sigact dump/restore code from cr-restore.c and parasite-syscall.c into sigact.c Signed-off-by: Arnav Bhatt --- criu/Makefile.crtools | 1 + criu/cr-dump.c | 1 + criu/cr-restore.c | 263 +------------------------- criu/include/parasite-syscall.h | 2 - criu/include/sigact.h | 14 ++ criu/parasite-syscall.c | 51 ----- criu/sigact.c | 319 ++++++++++++++++++++++++++++++++ 7 files changed, 336 insertions(+), 315 deletions(-) create mode 100644 criu/include/sigact.h create mode 100644 criu/sigact.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index bf17f1ec9d..3ddf45cd70 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -93,6 +93,7 @@ obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o obj-y += timer.o +obj-y += sigact.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/cr-dump.c b/criu/cr-dump.c index a29ec82eff..199ff2e322 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -87,6 +87,7 @@ #include "apparmor.h" #include "asm/dump.h" #include "timer.h" +#include "sigact.h" /* * Architectures can overwrite this function to restore register sets that 
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index c19a20b46f..deecb12946 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -99,6 +99,7 @@ #include "cr-errno.h" #include "timer.h" +#include "sigact.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread @@ -407,268 +408,6 @@ static int populate_pid_proc(void) return 0; } -static rt_sigaction_t sigchld_act; -/* - * If parent's sigaction has blocked SIGKILL (which is non-sense), - * this parent action is non-valid and shouldn't be inherited. - * Used to mark parent_act* no more valid. - */ -static rt_sigaction_t parent_act[SIGMAX]; -#ifdef CONFIG_COMPAT -static rt_sigaction_t_compat parent_act_compat[SIGMAX]; -#endif - -static bool sa_inherited(int sig, rt_sigaction_t *sa) -{ - rt_sigaction_t *pa; - int i; - - if (current == root_item) - return false; /* XXX -- inherit from CRIU? */ - - pa = &parent_act[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_native_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); -#ifdef CONFIG_MIPS - e->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); - - memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); - memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); -#else - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); -#endif - if (sig == SIGCHLD) { - 
sigchld_act = act; - return 0; - } - - if (sa_inherited(sig - 1, &act)) - return 1; - - /* - * A pure syscall is used, because glibc - * sigaction overwrites se_restorer. - */ - ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); - if (ret < 0) { - pr_perror("Can't restore sigaction"); - return ret; - } - - parent_act[sig - 1] = act; - /* Mark SIGKILL blocked which makes compat sigaction non-valid */ -#ifdef CONFIG_COMPAT - parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; -#endif - - return 1; -} - -static void *stack32; - -#ifdef CONFIG_COMPAT -static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) -{ - rt_sigaction_t_compat *pa; - int i; - - if (current == root_item) - return false; - - pa = &parent_act_compat[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t_compat act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); - - if (sig == SIGCHLD) { - memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); - return 0; - } - - if (sa_compat_inherited(sig - 1, &act)) - return 1; - - if (!stack32) { - stack32 = alloc_compat_syscall_stack(); - if (!stack32) - return -1; - } - - ret = arch_compat_rt_sigaction(stack32, sig, &act); - if (ret < 0) { - pr_err("Can't restore compat sigaction: %d\n", ret); - return ret; - } - - parent_act_compat[sig - 1] = act; - /* Mark SIGKILL blocked which makes native 
sigaction non-valid */ - parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; - - return 1; -} -#else -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - return -1; -} -#endif - -static int prepare_sigactions_from_core(TaskCoreEntry *tc) -{ - int sig, i; - - if (tc->n_sigactions != SIGMAX - 2) { - pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); - return -1; - } - - pr_info("Restore on-core sigactions for %d\n", vpid(current)); - - for (sig = 1, i = 0; sig <= SIGMAX; sig++) { - int ret; - SaEntry *e; - bool sigaction_is_compat; - - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - e = tc->sigactions[i++]; - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - if (ret < 0) - return ret; - } - - return 0; -} - -/* Returns number of restored signals, -1 or negative errno on fail */ -static int restore_one_sigaction(int sig, struct cr_img *img, int pid) -{ - bool sigaction_is_compat; - SaEntry *e; - int ret = 0; - - BUG_ON(sig == SIGKILL || sig == SIGSTOP); - - ret = pb_read_one_eof(img, &e, PB_SIGACT); - if (ret == 0) { - if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ - pr_err("Unexpected EOF %d\n", sig); - return -1; - } - pr_warn("This format of sigacts-%d.img is deprecated\n", pid); - return -1; - } - if (ret < 0) - return ret; - - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - sa_entry__free_unpacked(e, NULL); - - return ret; -} - -static int prepare_sigactions_from_image(void) -{ - int pid = vpid(current); - struct cr_img *img; - int sig, rst = 0; - int ret = 0; - - pr_info("Restore sigacts for %d\n", pid); - - img = open_image(CR_FD_SIGACT, O_RSTR, pid); - if (!img) - return -1; - - for (sig = 1; sig <= 
SIGMAX; sig++) { - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - ret = restore_one_sigaction(sig, img, pid); - if (ret < 0) - break; - if (ret) - rst++; - } - - pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); - - close_image(img); - return ret; -} - -static int prepare_sigactions(CoreEntry *core) -{ - int ret; - - if (!task_alive(current)) - return 0; - - if (core->tc->n_sigactions != 0) - ret = prepare_sigactions_from_core(core->tc); - else - ret = prepare_sigactions_from_image(); - - if (stack32) { - free_compat_syscall_stack(stack32); - stack32 = NULL; - } - - return ret; -} - static int __collect_child_pids(struct pstree_item *p, int state, unsigned int *n) { struct pstree_item *pi; diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h index 70ecbb720f..4a8ec2fee6 100644 --- a/criu/include/parasite-syscall.h +++ b/criu/include/parasite-syscall.h @@ -21,8 +21,6 @@ struct rt_sigframe; struct parasite_ctl; struct parasite_thread_ctl; -extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); - extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core); diff --git a/criu/include/sigact.h b/criu/include/sigact.h new file mode 100644 index 0000000000..4df011f961 --- /dev/null +++ b/criu/include/sigact.h @@ -0,0 +1,14 @@ +#ifndef __CR_SIGACT_H__ +#define __CR_SIGACT_H__ + +#include "images/core.pb-c.h" + +extern rt_sigaction_t sigchld_act; + +struct parasite_ctl; +struct pstree_item; + +extern int prepare_sigactions(CoreEntry *core); +extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); + +#endif diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 6d2aa9c887..a88f8a66f2 100644 --- a/criu/parasite-syscall.c +++ 
b/criu/parasite-syscall.c @@ -9,7 +9,6 @@ #include "common/compiler.h" #include "types.h" #include "protobuf.h" -#include "images/sa.pb-c.h" #include "images/timer.pb-c.h" #include "images/creds.pb-c.h" #include "images/core.pb-c.h" @@ -228,56 +227,6 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit return dump_thread_core(pid, core, args); } -int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - TaskCoreEntry *tc = item->core[0]->tc; - struct parasite_dump_sa_args *args; - int ret, sig; - SaEntry *sa, **psa; - - args = compel_parasite_args(ctl, struct parasite_dump_sa_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); - if (ret < 0) - return ret; - - psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); - if (!psa) - return -1; - - sa = (SaEntry *)(psa + SIGMAX - 2); - - tc->n_sigactions = SIGMAX - 2; - tc->sigactions = psa; - - for (sig = 1; sig <= SIGMAX; sig++) { - int i = sig - 1; - - if (sig == SIGSTOP || sig == SIGKILL) - continue; - - sa_entry__init(sa); - ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); - ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); - ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); -#ifdef CONFIG_MIPS - sa->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); - memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); -#else - BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); -#endif - sa->has_compat_sigaction = true; - sa->compat_sigaction = !compel_mode_native(ctl); - - *(psa++) = sa++; - } - - return 0; -} - int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc) { struct parasite_dump_misc *ma; diff --git 
a/criu/sigact.c b/criu/sigact.c new file mode 100644 index 0000000000..5174644d28 --- /dev/null +++ b/criu/sigact.c @@ -0,0 +1,319 @@ +#include "types.h" +#include "infect.h" +#include "protobuf.h" +#include "pstree.h" +#include "parasite.h" +#include "restorer.h" +#include "sigact.h" + +/* + * If parent's sigaction has blocked SIGKILL (which is non-sense), + * this parent action is non-valid and shouldn't be inherited. + * Used to mark parent_act* no more valid. + */ +static rt_sigaction_t parent_act[SIGMAX]; +#ifdef CONFIG_COMPAT +static rt_sigaction_t_compat parent_act_compat[SIGMAX]; +#endif + +static bool sa_inherited(int sig, rt_sigaction_t *sa) +{ + rt_sigaction_t *pa; + int i; + + if (current == root_item) + return false; /* XXX -- inherit from CRIU? */ + + pa = &parent_act[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static void *stack32; +rt_sigaction_t sigchld_act; + +#ifdef CONFIG_COMPAT +static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) +{ + rt_sigaction_t_compat *pa; + int i; + + if (current == root_item) + return false; + + pa = &parent_act_compat[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t_compat act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, 
(u32)e->restorer); + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); + + if (sig == SIGCHLD) { + memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); + return 0; + } + + if (sa_compat_inherited(sig - 1, &act)) + return 1; + + if (!stack32) { + stack32 = alloc_compat_syscall_stack(); + if (!stack32) + return -1; + } + + ret = arch_compat_rt_sigaction(stack32, sig, &act); + if (ret < 0) { + pr_err("Can't restore compat sigaction: %d\n", ret); + return ret; + } + + parent_act_compat[sig - 1] = act; + /* Mark SIGKILL blocked which makes native sigaction non-valid */ + parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; + + return 1; +} +#else +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + return -1; +} +#endif + +static int restore_native_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); +#ifdef CONFIG_MIPS + e->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); + + memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); + memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); +#else + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); +#endif + if (sig == SIGCHLD) { + sigchld_act = act; + return 0; + } + + if (sa_inherited(sig - 1, &act)) + return 1; + + /* + * A pure syscall is used, because glibc + * sigaction overwrites se_restorer. 
+ */ + ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); + if (ret < 0) { + pr_perror("Can't restore sigaction"); + return ret; + } + + parent_act[sig - 1] = act; + /* Mark SIGKILL blocked which makes compat sigaction non-valid */ +#ifdef CONFIG_COMPAT + parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; +#endif + + return 1; +} + +static int prepare_sigactions_from_core(TaskCoreEntry *tc) +{ + int sig, i; + + if (tc->n_sigactions != SIGMAX - 2) { + pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); + return -1; + } + + pr_info("Restore on-core sigactions for %d\n", vpid(current)); + + for (sig = 1, i = 0; sig <= SIGMAX; sig++) { + int ret; + SaEntry *e; + bool sigaction_is_compat; + + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + e = tc->sigactions[i++]; + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + if (ret < 0) + return ret; + } + + return 0; +} + +/* Returns number of restored signals, -1 or negative errno on fail */ +static int restore_one_sigaction(int sig, struct cr_img *img, int pid) +{ + bool sigaction_is_compat; + SaEntry *e; + int ret = 0; + + BUG_ON(sig == SIGKILL || sig == SIGSTOP); + + ret = pb_read_one_eof(img, &e, PB_SIGACT); + if (ret == 0) { + if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ + pr_err("Unexpected EOF %d\n", sig); + return -1; + } + pr_warn("This format of sigacts-%d.img is deprecated\n", pid); + return -1; + } + if (ret < 0) + return ret; + + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + sa_entry__free_unpacked(e, NULL); + + return ret; +} + +static int prepare_sigactions_from_image(void) +{ + int pid = vpid(current); + struct cr_img *img; + int 
sig, rst = 0; + int ret = 0; + + pr_info("Restore sigacts for %d\n", pid); + + img = open_image(CR_FD_SIGACT, O_RSTR, pid); + if (!img) + return -1; + + for (sig = 1; sig <= SIGMAX; sig++) { + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + ret = restore_one_sigaction(sig, img, pid); + if (ret < 0) + break; + if (ret) + rst++; + } + + pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); + + close_image(img); + return ret; +} + +int prepare_sigactions(CoreEntry *core) +{ + int ret; + + if (!task_alive(current)) + return 0; + + if (core->tc->n_sigactions != 0) + ret = prepare_sigactions_from_core(core->tc); + else + ret = prepare_sigactions_from_image(); + + if (stack32) { + free_compat_syscall_stack(stack32); + stack32 = NULL; + } + + return ret; +} + +int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + TaskCoreEntry *tc = item->core[0]->tc; + struct parasite_dump_sa_args *args; + int ret, sig; + SaEntry *sa, **psa; + + args = compel_parasite_args(ctl, struct parasite_dump_sa_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); + if (ret < 0) + return ret; + + psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); + if (!psa) + return -1; + + sa = (SaEntry *)(psa + SIGMAX - 2); + + tc->n_sigactions = SIGMAX - 2; + tc->sigactions = psa; + + for (sig = 1; sig <= SIGMAX; sig++) { + int i = sig - 1; + + if (sig == SIGSTOP || sig == SIGKILL) + continue; + + sa_entry__init(sa); + ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); + ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); + ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); +#ifdef CONFIG_MIPS + sa->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); + memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); +#else + 
BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); +#endif + sa->has_compat_sigaction = true; + sa->compat_sigaction = !compel_mode_native(ctl); + + *(psa++) = sa++; + } + + return 0; +} From 7de0b45729842d828a6cd05093bf6246d2f22a3e Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 26 May 2024 14:44:14 +0200 Subject: [PATCH 248/321] criu: use proper format-specified to accommodate time_t 64-bit change See also: https://wiki.debian.org/ReleaseGoals/64bit-time Signed-off-by: Alexander Mikhalitsyn --- criu/autofs.c | 4 ++-- criu/timens.c | 8 ++++---- criu/timer.c | 5 +++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/criu/autofs.c b/criu/autofs.c index 6a7d8db0df..a1775cbc96 100644 --- a/criu/autofs.c +++ b/criu/autofs.c @@ -658,7 +658,7 @@ static int autofs_mnt_make_catatonic(const char *mnt_path, int mnt_fd) static int autofs_mnt_set_timeout(time_t timeout, const char *mnt_path, int mnt_fd) { - pr_info("%s: set timeout %ld for %s\n", __func__, timeout, mnt_path); + pr_info("%s: set timeout %" PRId64 " for %s\n", __func__, (int64_t)timeout, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_SETTIMEOUT, &timeout); } @@ -770,7 +770,7 @@ static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, time_t timeout } if (autofs_mnt_set_timeout(timeout, mnt_path, mnt_fd)) { - pr_err("Failed to set timeout %ld for %s\n", timeout, mnt_path); + pr_err("Failed to set timeout %" PRId64 " for %s\n", (int64_t)timeout, mnt_path); return -1; } diff --git a/criu/timens.c b/criu/timens.c index 66c0c02a42..257782e5a5 100644 --- a/criu/timens.c +++ b/criu/timens.c @@ -96,8 +96,8 @@ int prepare_timens(int id) ts.tv_nsec = te->monotonic->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: monotonic %ld %ld\n", ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %ld %ld\n", CLOCK_MONOTONIC, ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: monotonic 
%" PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_MONOTONIC, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a monotonic clock offset"); goto err; } @@ -111,8 +111,8 @@ int prepare_timens(int id) ts.tv_nsec = te->boottime->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: boottime %ld %ld\n", ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %ld %ld\n", CLOCK_BOOTTIME, ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: boottime %" PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_BOOTTIME, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a boottime clock offset"); goto err; } diff --git a/criu/timer.c b/criu/timer.c index bdcb059cce..4b286635de 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -46,8 +46,9 @@ static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) return -1; } - pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, - val->it_interval.tv_sec, val->it_interval.tv_usec); + pr_info("Restored %s timer to %" PRId64 ".%ld -> %" PRId64 ".%ld\n", n, + (int64_t)val->it_value.tv_sec, val->it_value.tv_usec, + (int64_t)val->it_interval.tv_sec, val->it_interval.tv_usec); return 0; } From b384afa0d965e74e3b09467583aba671253140aa Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 25 May 2024 05:11:21 +0000 Subject: [PATCH 249/321] net: Fix TOCTOU race condition in unix_conf_op The unix_conf_op function reads the size of the sysctl entry array twice. gcc thinks that it can lead to a time-of-check to time-of-use (TOCTOU) race condition if the array size changes between the two reads. 
Fixes #2398 Signed-off-by: Andrei Vagin --- criu/net.c | 15 ++++++++------- scripts/build/Dockerfile.x86_64.hdr | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/criu/net.c b/criu/net.c index b5c4a6ee32..eee3311087 100644 --- a/criu/net.c +++ b/criu/net.c @@ -359,22 +359,23 @@ static int ipv6_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntr return net_conf_op(tgt, conf, n, op, "ipv6", req, path, ARRAY_SIZE(devconfs6), devconfs6, def_conf); } -static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) +static int unix_conf_op(SysctlEntry ***rconf, size_t *pn, int op) { int i, ret = -1, flags = 0; char path[ARRAY_SIZE(unix_conf_entries)][MAX_CONF_UNIX_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(unix_conf_entries)] = {}; SysctlEntry **conf = *rconf; + size_t n = *pn; - if (*n != ARRAY_SIZE(unix_conf_entries)) { - pr_err("unix: Unexpected entries in config (%zu %zu)\n", *n, ARRAY_SIZE(unix_conf_entries)); + if (n != ARRAY_SIZE(unix_conf_entries)) { + pr_err("unix: Unexpected entries in config (%zu %zu)\n", n, ARRAY_SIZE(unix_conf_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < *n; i++) { + for (i = 0; i < n; i++) { snprintf(path[i], MAX_CONF_UNIX_PATH, CONF_UNIX_FMT, unix_conf_entries[i]); req[i].name = path[i]; req[i].flags = flags; @@ -390,7 +391,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) } } - ret = sysctl_op(req, *n, op, CLONE_NEWNET); + ret = sysctl_op(req, n, op, CLONE_NEWNET); if (ret < 0) { pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? 
"read" : "write", CONF_UNIX_BASE); return -1; @@ -399,7 +400,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) if (op == CTL_READ) { bool has_entries = false; - for (i = 0; i < *n; i++) { + for (i = 0; i < n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { conf[i]->has_iarg = true; if (!has_entries) @@ -412,7 +413,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) * Unix conf is optional. */ if (!has_entries) { - *n = 0; + *pn = 0; *rconf = NULL; } } diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index 32fc2978a5..566b4c9160 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,4 +1,4 @@ -FROM ubuntu:focal +FROM ubuntu:24.04 COPY scripts/ci/apt-install /bin/apt-install From fea38908c480dc937d81efde4f13db2108460636 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 23 May 2024 14:21:17 +0100 Subject: [PATCH 250/321] pagemap-cache: handle short reads It is possible for pread() to return fewer number of bytes than requested. In such case, we need to repeat the read operation with appropriate offset. 
Signed-off-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/pagemap-cache.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 978a6b1aca..f04a517de3 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -165,7 +165,7 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) int pmc_fill(pmc_t *pmc, u64 start, u64 end) { - size_t size_map; + size_t size_map, off; pmc->start = start; pmc->end = end; @@ -204,10 +204,17 @@ int pmc_fill(pmc_t *pmc, u64 start, u64 end) pmc->regs_idx = 0; pmc->end = args.walk_end; } else { - if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { - pmc_zap(pmc); - pr_perror("Can't read %d's pagemap file", pmc->pid); - return -1; + for (off = 0; off != size_map;) { + ssize_t ret; + char *ptr = (char *)pmc->map; + + ret = pread(pmc->fd, ptr + off, size_map - off, PAGEMAP_PFN_OFF(pmc->start) + off); + if (ret == -1) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; + } + off += ret; } } From f4a16a09906fd3f495ad5b1207d78778d994dabd Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 23 May 2024 07:57:14 +0100 Subject: [PATCH 251/321] zdtm: add support for LD_PRELOAD tests This commit adds a `--preload-libfault` option to ZDTM's run command. This option runs CRIU with LD_PRELOAD to intercept libc functions such as pread(). This method allows to simulate special cases, for example, when a successful call to pread() transfers fewer bytes than requested. 
Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 3 +++ test/libfault/Makefile | 21 +++++++++++++++++++++ test/libfault/libfault.c | 31 +++++++++++++++++++++++++++++++ test/zdtm.py | 23 +++++++++++++++++++++-- test/zdtm/criu_config.py | 1 + 5 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 test/libfault/Makefile create mode 100644 test/libfault/libfault.c diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 8ee734fbc0..ef2dffb1a4 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -262,6 +262,9 @@ make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/static/env00 --sibling +./test/zdtm.py run -t zdtm/static/maps00 --preload-libfault +./test/zdtm.py run -t zdtm/static/maps02 --preload-libfault + ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server diff --git a/test/libfault/Makefile b/test/libfault/Makefile new file mode 100644 index 0000000000..cbe47fdf24 --- /dev/null +++ b/test/libfault/Makefile @@ -0,0 +1,21 @@ +CC = gcc +CFLAGS = -c -fPIC -ldl + +SRC = libfault.c +OBJ = $(SRC:.c=.o) + +LIB = libfault.so + +.PHONY: all clean run + +all: $(LIB) + +$(LIB): $(OBJ) + $(CC) -shared -o $(LIB) $(OBJ) + +$(OBJ): $(SRC) + $(CC) $(CFLAGS) $< + +clean: + rm -f $(OBJ) $(LIB) + diff --git a/test/libfault/libfault.c b/test/libfault/libfault.c new file mode 100644 index 0000000000..650bf08ca0 --- /dev/null +++ b/test/libfault/libfault.c @@ -0,0 +1,31 @@ +#define _GNU_SOURCE +#include +#include +#include + +ssize_t (*original_pread)(int fd, void *buf, size_t count, off_t offset) = NULL; + +/** + * This function is a wrapper around pread() that is used for testing CRIU's + * handling of cases where pread() returns less data than requested. + * + * pmc_fill() in criu/pagemap.c is a good example of where this can happen. 
+ */ +ssize_t pread64(int fd, void *buf, size_t count, off_t offset) +{ + if (!original_pread) { + original_pread = dlsym(RTLD_NEXT, "pread"); + if (!original_pread) { + errno = EIO; + return -1; + } + } + + /* The following aims to simulate the case when pread() returns less + * data than requested. We need to ensure that CRIU handles such cases. */ + if (count > 2048) { + count -= 1024; + } + + return original_pread(fd, buf, count, offset); +} diff --git a/test/zdtm.py b/test/zdtm.py index 7a7cdfd3b6..fbb3400c42 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -32,6 +32,14 @@ # File to store content of streamed images STREAMED_IMG_FILE_NAME = "img.criu" +# A library used to preload C functions to simulate +# cases such as partial read with pread(). +LIBFAULT_PATH = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "libfault", + "libfault.so" +) + prev_line = None uuid = uuid.uuid4() @@ -628,6 +636,8 @@ def available(): ["make", "zdtm_ct"], env=dict(os.environ, MAKEFLAGS="")) if not os.access("zdtm/lib/libzdtmtst.a", os.F_OK): subprocess.check_call(["make", "-C", "zdtm/"]) + if 'preload_libfault' in opts and opts['preload_libfault']: + subprocess.check_call(["make", "-C", "libfault/"]) if 'rootless' in opts and opts['rootless']: return subprocess.check_call( @@ -880,6 +890,7 @@ def run(action, fault=None, strace=[], preexec=None, + preload_libfault=False, nowait=False, timeout=60): env = dict( @@ -890,6 +901,9 @@ def run(action, print("Forcing %s fault" % fault) env['CRIU_FAULT'] = fault + if preload_libfault: + env['LD_PRELOAD'] = LIBFAULT_PATH + cr = subprocess.Popen(strace + [criu_bin, action, "--no-default-config"] + args, env=env, @@ -980,6 +994,7 @@ def run(action, fault=None, strace=[], preexec=None, + preload_libfault=False, nowait=False, timeout=None): if fault: @@ -1065,6 +1080,7 @@ def __init__(self, opts): self.__criu_bin = opts['criu_bin'] self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = opts['pre_dump_mode'] + 
self.__preload_libfault = bool(opts['preload_libfault']) self.__mntns_compat_mode = bool(opts['mntns_compat_mode']) if opts['rpc']: @@ -1192,8 +1208,10 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): with open("/proc/sys/kernel/ns_last_pid") as ns_last_pid_fd: ns_last_pid = ns_last_pid_fd.read() + preload_libfault = self.__preload_libfault and action in ['dump', 'pre-dump', 'restore'] + ret = self.__criu.run(action, s_args, self.__criu_bin, self.__fault, - strace, preexec, nowait) + strace, preexec, preload_libfault, nowait) if nowait: os.close(status_fds[1]) @@ -2083,7 +2101,7 @@ def run_test(self, name, desc, flavor): 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless') + 'rootless', 'preload_libfault') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2788,6 +2806,7 @@ def get_cli_args(): help="Select tests for a shard (0-based)") rp.add_argument("--test-shard-count", type=int, default=0, help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") + rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) diff --git a/test/zdtm/criu_config.py b/test/zdtm/criu_config.py index 487becfb4b..221c232929 100644 --- a/test/zdtm/criu_config.py +++ b/test/zdtm/criu_config.py @@ -11,6 +11,7 @@ def run(action, fault=None, strace=[], preexec=None, + preload=False, nowait=False): config_path = tempfile.mktemp(".conf", "criu-%s-" % action) From 9eaab45f9740b1f11d95c2a1ab9618150d16cb53 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Jun 2024 08:14:51 +0200 Subject: [PATCH 252/321] ci: remove CentOS Stream 8 test (EOL) Signed-off-by: Adrian Reber --- .cirrus.yml | 31 
------------------------------- 1 file changed, 31 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 72135590d9..5e30ca2c2b 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -92,37 +92,6 @@ task: build_script: | make -C scripts/ci vagrant-fedora-non-root -task: - name: CentOS Stream 8 based test - environment: - HOME: "/root" - CIRRUS_WORKING_DIR: "/tmp/criu" - - compute_engine_instance: - image_project: centos-cloud - image: family/centos-stream-8 - platform: linux - cpu: 4 - memory: 8G - - setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - # Do not fail if latest epel repository definition is already installed - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : - yum install -y dnf-plugins-core - yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel - alternatives --set python /usr/bin/python3 - systemctl stop sssd - # Even with selinux in permissive mode the selinux tests will be executed - # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode - setenforce 0 - - build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" - task: name: aarch64 build GCC (native) arm_container: From f287a1a387f53323b0fd98ce9a339c2c650de481 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Fri, 29 Mar 2024 00:06:58 +0530 Subject: [PATCH 253/321] zdtm: Distinguish between fail and crash of dump Adds a exit_signal static method to criu_cli, 
criu_config and criu_rpc used to detect a crash. Fixes: #350 Signed-off-by: Bhavik Sachdev --- test/zdtm.py | 19 +++++++++++++++---- test/zdtm/criu_config.py | 4 ++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index fbb3400c42..df23ea03d9 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -913,6 +913,10 @@ def run(action, return cr return cr.wait(timeout=timeout) + @staticmethod + def exit_signal(ret): + return ret < 0 + class criu_rpc_process: def wait(self): @@ -1033,8 +1037,11 @@ def run(action, else: raise test_fail_exc('RPC for %s required' % action) except crpc.CRIUExceptionExternal as e: - print("Fail", e) - ret = -1 + if e.typ != e.resp_typ: + ret = -2 + else: + print("Fail", e) + ret = -1 else: ret = 0 @@ -1047,6 +1054,10 @@ def run(action, return ret + @staticmethod + def exit_signal(ret): + return ret == -2 + class criu: def __init__(self, opts): @@ -1251,8 +1262,8 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): return rst_succeeded = os.access( os.path.join(__ddir, "restore-succeeded"), os.F_OK) - if self.__test.blocking() or (self.__sat and action == 'restore' and - rst_succeeded): + if (self.__test.blocking() and not self.__criu.exit_signal(ret)) or \ + (self.__sat and action == 'restore' and rst_succeeded): raise test_fail_expected_exc(action) else: raise test_fail_exc("CRIU %s" % action) diff --git a/test/zdtm/criu_config.py b/test/zdtm/criu_config.py index 221c232929..9fd2927476 100644 --- a/test/zdtm/criu_config.py +++ b/test/zdtm/criu_config.py @@ -41,3 +41,7 @@ def run(action, if nowait: return cr return cr.wait() + + @staticmethod + def exit_signal(ret): + return ret < 0 From ced120ad194047f94c3697dd80401109e3614020 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Sat, 20 Apr 2024 00:24:27 +0530 Subject: [PATCH 254/321] test/dump-crash: check code path when dump crashes Signed-off-by: Bhavik Sachdev --- criu/cr-dump.c | 4 ++++ criu/include/fault-injection.h | 1 + 
test/jenkins/criu-fault.sh | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 199ff2e322..ef3b5480f8 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2102,6 +2102,10 @@ static int cr_dump_finish(int ret) close_image_dir(); if (ret || post_dump_ret) { + if (fault_injected(FI_DUMP_CRASH)) { + pr_info("fault: CRIU dump crashed!\n"); + abort(); + } pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index fe75dfe860..552ee43389 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -20,6 +20,7 @@ enum faults { FI_CANNOT_MAP_VDSO = 133, FI_CORRUPT_EXTREGS = 134, FI_DONT_USE_PAGEMAP_SCAN = 135, + FI_DUMP_CRASH = 136, FI_MAX, }; diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 4a6d55e6bf..1fda40a969 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -39,3 +39,7 @@ fi ./test/zdtm.py run -t zdtm/static/fpu03 --fault 134 -f h --norst || fail # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail + +if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then + fail +fi From b5e2025765b9dfc0b2f583478b7a6380452956b3 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 9 Jun 2024 18:25:28 +0200 Subject: [PATCH 255/321] ci: upgrade to Fedora 40 Vagrant images (38 is EOL) Signed-off-by: Adrian Reber --- scripts/ci/vagrant.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 4c1be35443..3904c51d22 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,9 +6,9 @@ set -e set -x -VAGRANT_VERSION=2.3.7 -FEDORA_VERSION=38 -FEDORA_BOX_VERSION=38.20230413.1 +VAGRANT_VERSION=2.4.1 +FEDORA_VERSION=40 +FEDORA_BOX_VERSION=40.20240414.0 setup() { if [ -n "$TRAVIS" ]; then @@ -39,7 +39,7 @@ setup() { ssh 
default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel + rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default cat /proc/cmdline From 056712782e3b25501b49b3f44d1cefeeecd46009 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 May 2024 09:48:34 +0100 Subject: [PATCH 256/321] make: improve check for externally managed Python Move PYTHON_EXTERNALLY_MANAGED and PIP_BREAK_SYSTEM_PACKAGES into Makefile.install to avoid code duplication. In addition, add PIPFLAGS variable to enable specifying pip options during installation. This is particularly useful for packaging, where it is common for `pip install` to run in an environment with pre-installed dependencies and without internet access. In such environment, we need to specify the following options: --no-build-isolation --no-index --no-deps Signed-off-by: Radostin Stoyanov --- Makefile.install | 23 +++++++++++++++++++++++ crit/Makefile | 25 +++++-------------------- lib/Makefile | 25 +++++-------------------- 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/Makefile.install b/Makefile.install index 6f5b31924d..680b26c62b 100644 --- a/Makefile.install +++ b/Makefile.install @@ -29,6 +29,29 @@ LIBDIR ?= $(PREFIX)/lib export PREFIX BINDIR SBINDIR MANDIR RUNDIR export LIBDIR INCLUDEDIR LIBEXECDIR PLUGINDIR +# Detect externally managed Python environment (PEP 668). 
+PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES ?= 0 + +# If Python environment is externally managed and PIP_BREAK_SYSTEM_PACKAGES is not set, skip pip install. +SKIP_PIP_INSTALL := 0 +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + +SKIP_PIP_INSTALL := 1 +$(info Warn: Externally managed python environment) +$(info Consider using PIP_BREAK_SYSTEM_PACKAGES=1) + +endif +endif + +# Default flags for pip install: +# --upgrade: Upgrade crit/pycriu packages +# --ignore-installed: Ignore existing packages and reinstall them +PIPFLAGS ?= --upgrade --ignore-installed + +export SKIP_PIP_INSTALL PIPFLAGS + install-man: $(Q) $(MAKE) -C Documentation install .PHONY: install-man diff --git a/crit/Makefile b/crit/Makefile index 9a856db6d2..33bd68eedc 100644 --- a/crit/Makefile +++ b/crit/Makefile @@ -1,6 +1,3 @@ -PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') -PIP_BREAK_SYSTEM_PACKAGES := 0 - VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) all-y += ${VERSION_FILE} @@ -10,31 +7,19 @@ ${VERSION_FILE}: $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ install: ${VERSION_FILE} -ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) -ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" - $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" -else +ifeq ($(SKIP_PIP_INSTALL),0) $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit -endif + $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./crit else - $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed 
--prefix=$(DESTDIR)$(PREFIX) ./crit + $(E) " SKIP INSTALL crit" endif .PHONY: install uninstall: -ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) -ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" - $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" -else +ifeq ($(SKIP_PIP_INSTALL),0) $(E) " UNINSTALL" crit $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit -endif else - $(E) " UNINSTALL" crit - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit + $(E) " SKIP UNINSTALL crit" endif .PHONY: uninstall diff --git a/lib/Makefile b/lib/Makefile index ae371e78e0..4b8a6cbb83 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -4,9 +4,6 @@ UAPI_HEADERS := lib/c/criu.h images/rpc.proto images/rpc.pb-c.h criu/include/ve all-y += lib-c lib-a lib-py -PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') -PIP_BREAK_SYSTEM_PACKAGES := 0 - # # C language bindings. 
lib/c/Makefile: ; @@ -57,17 +54,11 @@ install: lib-c lib-a lib-py lib/c/criu.pc.in $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig -ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) -ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP INSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" - $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" -else +ifeq ($(SKIP_PIP_INSTALL),0) $(E) " INSTALL " pycriu - $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib -endif + $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./lib else - $(E) " INSTALL " pycriu - $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib + $(E) " SKIP INSTALL pycriu" endif .PHONY: install @@ -80,16 +71,10 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) -ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) -ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP UNINSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" - $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" -else +ifeq ($(SKIP_PIP_INSTALL),0) $(E) " UNINSTALL" pycriu $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu -endif else - $(E) " UNINSTALL" pycriu - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu + $(E) " SKIP UNINSTALL pycriu" endif .PHONY: uninstall From bf8c134714d6e041261b5129561eeb49aa734ec1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 22 Jun 2024 18:34:47 +0100 Subject: [PATCH 257/321] readme: update link to FAQ page The current link 
opens a page with the following text: The MediaWiki FAQ can be found at: https://www.mediawiki.org/wiki/Special:MyLanguage/Manual:FAQ Signed-off-by: Radostin Stoyanov --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 11d1c490b6..f578e745c9 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Pages worth starting with are: - [Installation instructions](http://criu.org/Installation) - [A simple example of usage](http://criu.org/Simple_loop) - [Examples of more advanced usage](https://criu.org/Category:HOWTO) -- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) +- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/index.php?title=FAQ) ### Checkpoint and restore of simple loop process

From fc65e466e4eed2a264f3b91053305d26e13207f2 Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Thu, 6 Jun 2024 11:12:39 -0700 Subject: [PATCH 258/321] criu: Restore rseq_cs state slightly earlier in the restore sequence and run the plugin finalizer later in the dump sequence Restore rseq_cs state before calling RESUME_DEVICES_LATE as the CUDA plugin will temporarily unfreeze a thread during the plugin hook to assist with device restore Run the plugin finalizer later in the dump sequence since the finalizer is used by the CUDA plugin to handle some process cleanup Signed-off-by: Jesus Ramos --- criu/cr-dump.c | 4 +++- criu/cr-restore.c | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index ef3b5480f8..1bc5d934f5 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2035,7 +2035,6 @@ static int cr_dump_finish(int ret) if (bfd_flush_images()) ret = -1; - cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); if (!ret) { @@ -2089,6 +2088,9 @@ static int cr_dump_finish(int ret) if (arch_set_thread_regs(root_item, true) < 0) return -1; + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + pstree_switch_state(root_item, (ret || post_dump_ret) ? 
TASK_ALIVE : opts.final_state); timing_stop(TIME_FROZEN); free_pstree(root_item); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index deecb12946..4db2f4ecfc 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2224,6 +2224,11 @@ static int restore_root_task(struct pstree_item *init) } finalize_restore(); + + /* just before releasing threads we have to restore rseq_cs */ + if (restore_rseq_cs()) + pr_err("Unable to restore rseq_cs state\n"); + /* * Some external devices such as GPUs might need a very late * trigger to kick-off some events, memory notifiers and for @@ -2255,10 +2260,6 @@ static int restore_root_task(struct pstree_item *init) if (restore_freezer_state()) pr_err("Unable to restore freezer state\n"); - /* just before releasing threads we have to restore rseq_cs */ - if (restore_rseq_cs()) - pr_err("Unable to restore rseq_cs state\n"); - /* Detaches from processes and they continue run through sigreturn. */ if (finalize_restore_detach()) goto out_kill_network_unlocked; From a85f488595e0a3a6e6cc6ca7c94d4a00b1341aaf Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Thu, 6 Jun 2024 11:16:07 -0700 Subject: [PATCH 259/321] criu/plugin: Introduce new plugin hooks PAUSE_DEVICES and CHECKPOINT_DEVICES to be used during pstree collection PAUSE_DEVICES is called before a process is frozen and is used by the CUDA plugin to place the process in a state that's ready to be checkpointed and quiesce any pending work CHECKPOINT_DEVICES is called after all processes in the tree have been frozen and PAUSE'd and performs the actual checkpointing operation for CUDA applications Signed-off-by: Jesus Ramos --- criu/include/criu-plugin.h | 6 ++++++ criu/plugin.c | 2 ++ criu/seize.c | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 886832eaaa..392ea9f534 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -56,6 +56,10 @@ enum { CR_PLUGIN_HOOK__RESUME_DEVICES_LATE 
= 9, + CR_PLUGIN_HOOK__PAUSE_DEVICES = 10, + + CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + CR_PLUGIN_HOOK__MAX }; @@ -72,6 +76,8 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); enum { CR_PLUGIN_STAGE__DUMP, diff --git a/criu/plugin.c b/criu/plugin.c index f3fea28566..58b5ea5bfe 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -57,6 +57,8 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma"); __assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map"); __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); + __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); + __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); #undef __assign_hook diff --git a/criu/seize.c b/criu/seize.c index 91090ae1a7..d392259bc5 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -16,6 +16,7 @@ #include "pstree.h" #include "criu-log.h" #include +#include "plugin.h" #include "proc_parse.h" #include "seccomp.h" #include "seize.h" @@ -637,6 +638,11 @@ static int collect_children(struct pstree_item *item) goto free; } + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto free; + } + if (!opts.freeze_cgroup) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -966,6 +972,7 @@ int collect_pstree(void) pid_t pid = root_item->pid->real; int ret = -1; struct proc_status_creds creds; + struct pstree_item *iter; timing_start(TIME_FREEZING); @@ -984,6 +991,11 @@ int collect_pstree(void) if (opts.freeze_cgroup && freeze_processes()) goto err; + 
ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; @@ -1017,6 +1029,12 @@ int collect_pstree(void) goto err; } + for_each_pstree_item(iter) { + ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } + ret = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); From c0708cbffbe3fe12037b1860097a8eaf50b45f56 Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Fri, 31 May 2024 13:38:54 -0700 Subject: [PATCH 260/321] criu/plugin: Add NVIDIA CUDA plugin Adding support for the NVIDIA cuda-checkpoint utility, requires the use of an r555 or higher driver along with the cuda-checkpoint binary. Signed-off-by: Jesus Ramos --- Makefile | 15 +- Makefile.install | 7 +- plugins/cuda/Makefile | 42 ++++ plugins/cuda/README.md | 59 +++++ plugins/cuda/cuda_plugin.c | 459 +++++++++++++++++++++++++++++++++++++ 5 files changed, 578 insertions(+), 4 deletions(-) create mode 100644 plugins/cuda/Makefile create mode 100644 plugins/cuda/README.md create mode 100644 plugins/cuda/cuda_plugin.c diff --git a/Makefile b/Makefile index 6a17a30b5a..172d4b5177 100644 --- a/Makefile +++ b/Makefile @@ -165,7 +165,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: flog criu lib crit +all: flog criu lib crit cuda_plugin .PHONY: all # @@ -308,15 +308,19 @@ clean-amdgpu_plugin: $(Q) $(MAKE) -C plugins/amdgpu clean .PHONY: clean-amdgpu_plugin +clean-cuda_plugin: + $(Q) $(MAKE) -C plugins/cuda clean +.PHONY: clean-cuda_plugin + clean-top: $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) $(build)=test/compel clean $(Q) $(RM) .gitid .PHONY: clean-top -clean: clean-top clean-amdgpu_plugin +clean: clean-top clean-amdgpu_plugin clean-cuda_plugin -mrproper-top: clean-top clean-amdgpu_plugin +mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin $(Q) 
$(RM) $(CONFIG_HEADER) $(Q) $(RM) $(VERSION_HEADER) $(Q) $(RM) $(COMPEL_VERSION_HEADER) @@ -348,6 +352,10 @@ amdgpu_plugin: criu $(Q) $(MAKE) -C plugins/amdgpu all .PHONY: amdgpu_plugin +cuda_plugin: criu + $(Q) $(MAKE) -C plugins/cuda all +.PHONY: cuda_plugin + crit: lib $(Q) $(MAKE) -C crit .PHONY: crit @@ -434,6 +442,7 @@ help: @echo ' lint - Run code linters' @echo ' indent - Indent C code' @echo ' amdgpu_plugin - Make AMD GPU plugin' + @echo ' cuda_plugin - Make NVIDIA CUDA plugin' .PHONY: help ruff: diff --git a/Makefile.install b/Makefile.install index 680b26c62b..455735f3b1 100644 --- a/Makefile.install +++ b/Makefile.install @@ -72,12 +72,16 @@ install-amdgpu_plugin: amdgpu_plugin $(Q) $(MAKE) -C plugins/amdgpu install .PHONY: install-amdgpu_plugin +install-cuda_plugin: cuda_plugin + $(Q) $(MAKE) -C plugins/cuda install +.PHONY: install-cuda_plugin + install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ; +install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ; .PHONY: install uninstall: @@ -88,4 +92,5 @@ uninstall: $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ $(Q) $(MAKE) -C plugins/amdgpu $@ + $(Q) $(MAKE) -C plugins/cuda $@ .PHONY: uninstall diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile new file mode 100644 index 0000000000..2eabc0e314 --- /dev/null +++ b/plugins/cuda/Makefile @@ -0,0 +1,42 @@ +PLUGIN_NAME := cuda_plugin +PLUGIN_SOBJ := cuda_plugin.so + +DEPS_CUDA := $(PLUGIN_SOBJ) + +PLUGIN_INCLUDE := -iquote../../include +PLUGIN_INCLUDE += -iquote../../criu/include +PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ +PLUGIN_INCLUDE += -iquote../../ + +COMPEL := ../../compel/compel-host + +CC := gcc +PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles 
-fPIC + +__nmk_dir ?= ../../scripts/nmk/scripts/ +include $(__nmk_dir)msg.mk + +all: $(DEPS_CUDA) + +cuda_plugin.so: cuda_plugin.c + $(call msg-gen, $@) + $(Q) $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) + +clean: + $(call msg-clean, $@) + $(Q) $(RM) $(PLUGIN_SOBJ) +.PHONY: clean + +mrproper: clean + +install: + $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) + $(E) " INSTALL " $(PLUGIN_NAME) + $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) +.PHONY: install + +uninstall: + $(E) " UNINSTALL" $(PLUGIN_NAME) + $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) +.PHONY: uninstall + diff --git a/plugins/cuda/README.md b/plugins/cuda/README.md new file mode 100644 index 0000000000..7b91f69989 --- /dev/null +++ b/plugins/cuda/README.md @@ -0,0 +1,59 @@ +Checkpoint and Restore for CUDA applications with CRIU +====================================================== + +# Requirements +The cuda-checkpoint utility should be placed somewhere in your $PATH and an r555 +or higher GPU driver is required for CUDA CRIU integration support. + +## cuda-checkpoint +The cuda-checkpoint utility can be found at: +https://github.com/NVIDIA/cuda-checkpoint + +cuda-checkpoint is a binary utility used to issue checkpointing commands to CUDA +applications. Updating the cuda-checkpoint utility between driver releases +should not be necessary as the utility simply exposes some extra driver behavior +so driver updates are all that's needed to get access to newer features. + +# Checkpointing Procedure +cuda-checkpoint exposes 4 actions used in the checkpointing process: lock, +checkpoint, restore, unlock. 
+ +* lock - Used with the PAUSE_DEVICES hook while a process is still running to + quiesce the application into a state where it can be checkpointed +* checkpoint - Used with the CHECKPOINT_DEVICES hook once a process has been + seized/frozen to perform the actual checkpointing operation +* restore/unlock - Used with the RESUME_DEVICES_LATE hook to restore the CUDA + state and release the process back to its running state + +These actions are facilitated by a CUDA checkpoint+restore thread that the CUDA +plugin will re-wake when needed. + +# Known Limitations +* Currently GPU memory contents are brought into main system memory and CRIU + then checkpoints that as part of the normal procedure. On systems with many + GPUs with high GPU memory usage this can cause memory thrashing. A future + CUDA release will add support for dumping the memory contents to files to + alleviate this as well as support in the CRIU plugin. +* There's currently a small race between when a PAUSE_DEVICES hook is called on + a running process and a process calls cuInit() and finishes initializing CUDA + after the PAUSE is issued but before the process is frozen to checkpoint. This + will cause cuda-checkpoint to report that the process is in an illegal state + for checkpointing and it's recommended to just attempt the CRIU procedure + again; this should be very rare. +* Applications that use NVML will leave some leftover device references as NVML + is not currently supported for checkpointing. There will be support for this + in later drivers. A possible temporary workaround is to have the + {DUMP,RESTORE}_EXT_FILE hook just ignore /dev/nvidiactl and /dev/nvidia{0..N} + remaining references for these applications as in most cases NVML is used to + get info such as GPU count and some capabilities and these values are never + accessed again and unlikely to change. 
+* CUDA applications that fork() but don't call exec() but also don't issue any + CUDA API calls will have some leftover references to /dev/nvidia* and fail to + checkpoint as a result. This can be worked around in a similar fashion to the + NVML case where the leftover references can be ignored as CUDA is not fork() + safe anyway. +* Restore currently requires that you restore on a system with similar GPU's and + same GPU count. +* NVIDIA UVM Managed Memory, MIG (Multi Instance GPU), and MPS (Multi-Process + Service) are currently not supported for checkpointing. Future CUDA releases + will add support for these. diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c new file mode 100644 index 0000000000..b3f2fc8df7 --- /dev/null +++ b/plugins/cuda/cuda_plugin.c @@ -0,0 +1,459 @@ +#include "criu-log.h" +#include "plugin.h" +#include "util.h" +#include "cr_options.h" +#include "pid.h" +#include "proc_parse.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +/* cuda-checkpoint binary should live in your PATH */ +#define CUDA_CHECKPOINT "cuda-checkpoint" + +/* cuda-checkpoint --action flags */ +#define ACTION_LOCK "lock" +#define ACTION_CHECKPOINT "checkpoint" +#define ACTION_RESTORE "restore" +#define ACTION_UNLOCK "unlock" + +#define CUDA_CKPT_BUF_SIZE (128) + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "cuda_plugin: " + +/* Disable plugin functionality if cuda-checkpoint is not in $PATH or driver + * version doesn't support --action flag + */ +bool plugin_disabled = false; + +struct pid_info { + int pid; + char checkpointed; + struct list_head list; +}; + +/* Used to track which PID's we've paused CUDA operations on so far so we can + * release them after we're done with the DUMP + */ +struct list_head cuda_pids; + +static void dealloc_pid_buffer(struct list_head *pid_buf) +{ + struct pid_info *info; + struct pid_info *n; + + list_for_each_entry_safe(info, n, pid_buf, list) { + 
list_del(&info->list); + xfree(info); + } +} + +static int add_pid_to_buf(struct list_head *pid_buf, int pid) +{ + struct pid_info *new = xmalloc(sizeof(*new)); + + if (new == NULL) { + return -1; + } + + new->pid = pid; + new->checkpointed = 0; + list_add_tail(&new->list, pid_buf); + + return 0; +} + +static int update_checkpointed_pid(struct list_head *pid_buf, int pid) +{ + struct pid_info *info; + + list_for_each_entry(info, pid_buf, list) { + if (info->pid == pid) { + info->checkpointed = 1; + return 0; + } + } + + return -1; +} + +static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) +{ +#define READ 0 +#define WRITE 1 + int fd[2]; + + if (pipe(fd) != 0) { + pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); + return -1; + } + + buf[0] = '\0'; + + int child_pid = fork(); + if (child_pid == -1) { + pr_err("Failed to fork to exec cuda-checkpoint\n"); + close(fd[READ]); + close(fd[WRITE]); + return -1; + } + + if (child_pid == 0) { // child + if (dup2(fd[WRITE], STDOUT_FILENO) == -1) { + return -1; + } + if (dup2(fd[WRITE], STDERR_FILENO) == -1) { + return -1; + } + close(fd[READ]); + return execvp(args[0], (char **)args); + } else { // parent + close(fd[WRITE]); + + int bytes_read = read(fd[READ], buf, buf_size); + if (bytes_read > 0) { + buf[bytes_read - 1] = '\0'; + } + + // Clear out any of the remaining output in the pipe in case the buffer wasn't large enough + struct pollfd read_poll = { .fd = fd[READ], .events = POLLIN | POLLHUP }; + while (true) { + int poll_status = poll(&read_poll, 1, -1); + if (poll_status == -1) { + close(fd[READ]); + pr_err("Unexpected error when clearing cuda-checkpoint output buffer\n"); + return -1; + } + if (read_poll.revents & POLLHUP) { + break; + } + // POLLIN, read into scratch buffer to flush things out + char scratch[64]; + bytes_read = read(fd[READ], scratch, sizeof(scratch)); + } + + int status; + if (waitpid(child_pid, &status, 0) == -1 || !WIFEXITED(status)) { + 
pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n"); + close(fd[READ]); + return -1; + } + + close(fd[READ]); + + return WEXITSTATUS(status); + } +} + +static bool cuda_checkpoint_supports_flag(const char *flag) +{ + char msg_buf[2048]; + const char *args[] = { CUDA_CHECKPOINT, "-h", NULL }; + int ret = launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)); + if (ret != 0) { + pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n"); + return false; + } + + if (strstr(msg_buf, flag) == NULL) { + return false; + } + + return true; +} + +/* Retrieve the cuda restore thread TID from the root pid */ +static int get_cuda_restore_tid(int root_pid) +{ + char pid_buf[16]; + char pid_out[CUDA_CKPT_BUF_SIZE]; + + snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid); + + const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL }; + int ret = launch_cuda_checkpoint(args, pid_out, sizeof(pid_out)); + if (ret != 0) { + pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out); + return -1; + } + + return atoi(pid_out); +} + +static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf, + int buf_size) +{ + char pid_buf[16]; + char timeout_buf[16]; + + snprintf(pid_buf, sizeof(pid_buf), "%d", pid); + + const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */, + NULL /* timeout_val */, NULL }; + if (timeout > 0) { + snprintf(timeout_buf, sizeof(timeout_buf), "%d", timeout); + args[5] = "--timeout"; + args[6] = timeout_buf; + } + + return launch_cuda_checkpoint(args, msg_buf, buf_size); +} + +static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset) +{ + /* Since we resumed a thread that CRIU previously already froze we need to + * INTERRUPT it once again, task was already SEIZE'd so we don't need to do + * a compel_interrupt_task() + */ + if 
(ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { + pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n", + restore_tid); + return -1; + } + + struct proc_status_creds creds; + if (compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &creds.s, NULL) != COMPEL_TASK_ALIVE) { + pr_err("compel_wait_task failed after interrupt\n"); + return -1; + } + + if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { + pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid); + return -1; + } + + if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { + pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid); + return -1; + } + + return 0; +} + +static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) +{ + k_rtsigset_t block; + + if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { + pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid); + return -1; + } + + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + + if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { + pr_err("Failed to block signals on restore tid %d\n", restore_tid); + return -1; + } + + // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread + if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) { + pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid); + return -1; + } + + if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { + pr_err("Could not resume cuda restore tid %d\n", restore_tid); + return -1; + } + + return 0; +} + +int cuda_plugin_checkpoint_devices(int pid) +{ + int restore_tid; + char msg_buf[CUDA_CKPT_BUF_SIZE]; + int int_ret; + int status; + k_rtsigset_t save_sigset; + + if (plugin_disabled) { + return 0; + } + + restore_tid = get_cuda_restore_tid(pid); + + /* We can possibly hit a race with cuInit() 
where we are past the point of + * locking the process but at lock time cuInit() hadn't completed in which + * case cuda-checkpoint will report that we're in an invalid state to + * checkpoint + */ + if (restore_tid == -1) { + pr_info("No need to checkpoint devices on pid %d\n", pid); + return 0; + } + + pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid); + /* We need to resume the checkpoint thread to prepare the mappings for + * checkpointing + */ + if (resume_restore_thread(restore_tid, &save_sigset)) { + return -1; + } + status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); + goto interrupt; + } + status = update_checkpointed_pid(&cuda_pids, pid); + if (status) { + pr_err("Failed to track checkpointed pid %d\n", pid); + status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); + } + } +interrupt: + int_ret = interrupt_restore_thread(restore_tid, &save_sigset); + + return status != 0 ? 
status : int_ret; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); + +int cuda_plugin_pause_devices(int pid) +{ + int restore_tid; + char msg_buf[CUDA_CKPT_BUF_SIZE]; + + if (plugin_disabled) { + return 0; + } + + restore_tid = get_cuda_restore_tid(pid); + + if (restore_tid == -1) { + pr_info("no need to pause devices on pid %d\n", pid); + return 0; + } + + pr_info("pausing devices on pid %d\n", pid); + int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("PAUSE_DEVICES failed with %s\n", msg_buf); + return -1; + } + if (add_pid_to_buf(&cuda_pids, pid)) { + pr_err("unable to track paused pid %d\n", pid); + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid); + } + return -1; + } + + return 0; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) + +int resume_device(int pid, int checkpointed) +{ + char msg_buf[CUDA_CKPT_BUF_SIZE]; + int status; + int ret = 0; + int int_ret; + k_rtsigset_t save_sigset; + + int restore_tid = get_cuda_restore_tid(pid); + if (restore_tid == -1) { + pr_info("No need to resume devices on pid %d\n", pid); + return 0; + } + + pr_info("resuming devices on pid %d\n", pid); + /* The resuming process has to stay frozen during this time otherwise + * attempting to access a UVM pointer will crash if we haven't restored the + * underlying mappings yet + */ + pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid); + /* wakeup the restore thread so we can handle the restore for this pid, + * rseq_cs has to be restored before execution + */ + if (resume_restore_thread(restore_tid, &save_sigset)) { + return -1; + } + + if (checkpointed) { + status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); + if 
(status) { + pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf); + ret = -1; + goto interrupt; + } + } + + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); + ret = -1; + } + +interrupt: + int_ret = interrupt_restore_thread(restore_tid, &save_sigset); + + return ret != 0 ? ret : int_ret; +} + +int cuda_plugin_resume_devices_late(int pid) +{ + if (plugin_disabled) { + return 0; + } + + return resume_device(pid, 1); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) + +int cuda_plugin_init(int stage) +{ + if (!cuda_checkpoint_supports_flag("--action")) { + pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n"); + plugin_disabled = true; + return 0; + } + + pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage); + + /* In the DUMP stage track all the PID's we've paused CUDA operations on to + * release them when we're done if the user requested the leave-running option + */ + if (stage == CR_PLUGIN_STAGE__DUMP) { + INIT_LIST_HEAD(&cuda_pids); + } + + return 0; +} + +void cuda_plugin_fini(int stage, int ret) +{ + if (plugin_disabled) { + return; + } + + pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret); + + /* Release all the paused PID's at the end of the DUMP stage in case the + * user provides the -R (leave-running) flag or an error occurred + */ + if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) { + struct pid_info *info; + list_for_each_entry(info, &cuda_pids, list) { + resume_device(info->pid, info->checkpointed); + } + } + if (stage == CR_PLUGIN_STAGE__DUMP) { + dealloc_pid_buffer(&cuda_pids); + } +} +CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini) From a11e944996d4a71ed275cc12bd6eefa795015100 Mon Sep 17 00:00:00 2001 From: Pratyush 
Yadav Date: Tue, 25 Jun 2024 14:56:25 +0200 Subject: [PATCH 261/321] compel: fix build on Amazon Linux 2 due to missing PTRACE_ARCH_PRCTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit fc683cb01 ("compel: shstk: save CET state when CPU supports it") started using PTRACE_ARCH_PRCTL to query shadow stack status. While PTRACE_ARCH_PRCTL has existed in the kernel for a long time, it was only added to glibc in version 2.27. Amazon Linux 2 (AL2) has glibc 2.26, which does not have this definition. As a result, build on AL2 fails with the below error: compel/arch/x86/src/lib/infect.c: In function ‘get_task_xsave’: compel/arch/x86/src/lib/infect.c:276:14: error: ‘PTRACE_ARCH_PRCTL’ undeclared (first use in this function) 276 | if (ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long)&features, ARCH_SHSTK_STATUS)) { | ^~~~~~~~~~~~~~~~~ While the definition is present on the system via the kernel headers (in asm/ptrace-abi.h) which can be reached by including linux/ptrace.h, the comment in compel/include/uapi/ptrace.h says: We'd want to include both sys/ptrace.h and linux/ptrace.h, hoping that most definitions come from either one or another. Alas, on Alpine/musl both files declare struct ptrace_peeksiginfo_args, so there is no way they can be used together. Let's rely on libc one. Since including linux/ptrace.h is not an option, define PTRACE_ARCH_PRCTL if it doesn't already exist. An interesting point to note is that in sys/ptrace.h, PTRACE_ARCH_PRCTL is an enum value so the preprocessor doesn't know about it. PT_ARCH_PRCTL is the preprocessor symbol that matches the value of PTRACE_ARCH_PRCTL. So look for PT_ARCH_PRCTL to decide if PTRACE_ARCH_PRCTL is available or not. Another interesting point to note is that AL2 ships with GCC 7 by default, which does not support the -mshstk option, causing other build failures. Luckily, it also ships GCC 10 which does have the option. Using GCC 10 lets the build succeed. 
Fixes: fc683cb01 ("compel: shstk: save CET state when CPU supports it") Signed-off-by: Pratyush Yadav --- compel/include/uapi/ptrace.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h index 63dfee97fd..558124fbd6 100644 --- a/compel/include/uapi/ptrace.h +++ b/compel/include/uapi/ptrace.h @@ -86,6 +86,19 @@ struct __ptrace_rseq_configuration { #define PTRACE_EVENT_STOP 128 #endif +/* + * Amazon Linux 2 uses glibc 2.26. PTRACE_ARCH_PRCTL was added in glibc 2.27. + * This allows CRIU to build on Amazon Linux 2. + * + * Note that in sys/ptrace.h, PTRACE_ARCH_PRCTL is an enum value so the + * preprocessor doesn't know about it. PT_ARCH_PRCTL is the preprocessor symbol + * that matches the value of PTRACE_ARCH_PRCTL. So look for PT_ARCH_PRCTL to + * decide if PTRACE_ARCH_PRCTL is available or not. + */ +#if defined(__x86_64__) && !defined(PT_ARCH_PRCTL) +#define PTRACE_ARCH_PRCTL 30 /* From asm/ptrace-abi.h. */ +#endif + extern int ptrace_suspend_seccomp(pid_t pid); extern int __must_check ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); From 0a725b899f8d1b2088624e9bf44b1a2eae78b949 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 2 Jul 2024 00:39:53 -0700 Subject: [PATCH 262/321] plugins/cuda: fix crosscompilation Signed-off-by: Andrei Vagin --- plugins/cuda/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile index 2eabc0e314..e337056dc1 100644 --- a/plugins/cuda/Makefile +++ b/plugins/cuda/Makefile @@ -10,7 +10,6 @@ PLUGIN_INCLUDE += -iquote../../ COMPEL := ../../compel/compel-host -CC := gcc PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC __nmk_dir ?= ../../scripts/nmk/scripts/ From fac8d640aa040624d9c371ec2be1df32fb57f42e Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Thu, 30 May 2024 19:52:54 +0800 Subject: [PATCH 263/321] irmap: duplicate string in irmap_scan_path_add Duplicate string in 
irmap_scan_path_add, otherwise it will free before parsing next configuration input. [ avagin: handle errors of xstrdup ] Signed-off-by: Liu Hua Signed-off-by: Andrei Vagin --- criu/irmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/criu/irmap.c b/criu/irmap.c index 37d098db11..d2c5d588a2 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -500,7 +500,12 @@ int irmap_scan_path_add(char *path) return -1; } - o->ir->path = path; + o->ir->path = xstrdup(path); + if (!o->ir->path) { + xfree(o->ir); + xfree(o); + return -1; + } o->ir->nr_kids = -1; list_add_tail(&o->node, &opts.irmap_scan_paths); return 0; From c2f101a2c962804042b0b9d83489f92b8675d377 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 2 Jul 2024 11:56:41 +0800 Subject: [PATCH 264/321] cgroupd: unblock SIGTERM to make stop_cgroupd actually work Sometimes due to sigblockmask inheritance cgroupd can inherit SIGTERM blocked. That will lead cgroupd ignoring SIGTERM from stop_cgroupd() and CRIU will get stuck due to waiting for never-stopping cgroupd. I see this happening in lxc-checkpoint, also saw this in OpenVZ jenkins on cgroup_inotify00 test. Signed-off-by: Pavel Tikhomirov --- criu/cgroup.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/criu/cgroup.c b/criu/cgroup.c index 6d1f74457d..d90b70bb79 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1947,6 +1947,21 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) return 0; } +static int cgroupd_unblock_sigterm(void) +{ + sigset_t unblockmask; + + sigemptyset(&unblockmask); + sigaddset(&unblockmask, SIGTERM); + + if (sigprocmask(SIG_UNBLOCK, &unblockmask, NULL)) { + pr_perror("cgroupd: can't unblock SIGTERM"); + return -1; + } + + return 0; +} + /* * If a thread is a different cgroup set than the main thread in process, * it means it is in a threaded controller. 
This daemon receives the cg_set @@ -1955,6 +1970,14 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) */ static int cgroupd(int sk) { + /* + * This pairs with SIGTERM in stop_cgroupd(), and ensures that cgroupd + * will receive termination signal, regardless of which signal block + * mask was inherited. + */ + if (cgroupd_unblock_sigterm()) + return -1; + pr_info("cgroud: Daemon started\n"); while (1) { From c6c83f1fec574ed15a7200ddbaeb9bf3092293c5 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 2 Jul 2024 23:30:31 -0700 Subject: [PATCH 265/321] apparmor: get_suspend_policy must return NULL in error cases Before this fix, it could return MAP_FAILED which is ((void *) -1). Signed-off-by: Andrei Vagin --- criu/apparmor.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/apparmor.c b/criu/apparmor.c index e46e239f59..48b639216a 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -469,6 +469,7 @@ static void *get_suspend_policy(char *name, off_t *len) ret = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (ret == MAP_FAILED) { pr_perror("mmap of %s failed", file); + ret = NULL; goto out; } From 6f92787b7f38c13523b959477697fa6b12a383ca Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 3 Jul 2024 09:26:28 -0700 Subject: [PATCH 266/321] vdso: proxify the __vdso_clock_gettime64 function It was added in v5.3-rc1~211^2~4^2~10. Fixes #2390 Signed-off-by: Andrei Vagin --- criu/arch/x86/include/asm/vdso.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h index 3b3f292bde..ca46374a55 100644 --- a/criu/arch/x86/include/asm/vdso.h +++ b/criu/arch/x86/include/asm/vdso.h @@ -12,7 +12,7 @@ * This is a minimal amount of symbols * we should support at the moment. 
*/ -#define VDSO_SYMBOL_MAX 6 +#define VDSO_SYMBOL_MAX 7 #define VDSO_SYMBOL_GTOD 2 /* @@ -42,11 +42,12 @@ const char *aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ const char *aarch_vdso_symbol4 = "__vdso_time"; \ const char *aarch_vdso_symbol5 = "__kernel_sigreturn"; \ - const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; + const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; \ + const char *aarch_vdso_symbol7 = "__vdso_clock_gettime64"; \ #define ARCH_VDSO_SYMBOLS \ aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5, \ - aarch_vdso_symbol6 + aarch_vdso_symbol6, aarch_vdso_symbol7 /* "__kernel_vsyscall", */ From 116c689da324ebce110871642010b5c7f34d836e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 13:24:19 +0100 Subject: [PATCH 267/321] scripts/build: drop centos 7 targets The CI tests with CentOS 7 have been disabled and removed [1,2]. This patch removes the obsolete Makefile targets for these tests. [1] https://github.com/checkpoint-restore/criu/commit/24bc083653f7d2b984653194e921b1ff32292b3b [2] https://github.com/checkpoint-restore/criu/commit/f8466ca798acd124eebbba2655894ebd2f777879 Signed-off-by: Radostin Stoyanov --- scripts/build/Makefile | 2 +- scripts/ci/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 2c006ad873..bc4a59db1c 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,4 +1,4 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide centos7 armv7hf centos8 +ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 1caa1e4235..9dc0190b37 
100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,7 +11,7 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide centos7 centos8 archlinux +TARGETS := alpine fedora-rawhide centos8 archlinux ZDTM_OPTS := UNAME := $(shell uname -m) export UNAME From dcb577b0dcfa8d6ef4bc744d2343fecb18f31d8b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 8 Jul 2024 19:31:38 -0700 Subject: [PATCH 268/321] util: use close_range when it's supported close_range is faster than reading /proc/self/fd and closing descriptors one by one. Signed-off-by: Andrei Vagin --- .../arch/arm/plugins/std/syscalls/syscall.def | 1 + .../mips/plugins/std/syscalls/syscall_64.tbl | 1 + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + .../plugins/std/syscalls/syscall-s390.tbl | 1 + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + criu/include/kerndat.h | 1 + criu/include/util.h | 2 ++ criu/kerndat.c | 25 +++++++++++++++++++ criu/util.c | 14 +++++++++++ 10 files changed, 48 insertions(+) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 217e346a31..9a33009eb0 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -118,6 +118,7 @@ fsopen 430 430 (char *fsname, unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) clone3 435 435 (struct clone_args *uargs, size_t size) +close_range 436 436 (unsigned int fd, unsigned int max_fd, unsigned int flags) pidfd_open 434 434 (pid_t pid, unsigned int flags) openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index 
9f50d5e8ad..85faca5a92 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -115,6 +115,7 @@ __NR_fsopen 5430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 5431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 5436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index 4c9b75cf1b..c56b4e6de6 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -114,6 +114,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index af7d550e2c..018d58a590 100644 --- 
a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -114,6 +114,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index ab36a5cd6f..cc23dc3f35 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -102,6 +102,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 4e843bee9e..7fbfd69ad1 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ 
b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -113,6 +113,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 41524ed663..e03a573419 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -88,6 +88,7 @@ struct kerndat_s { bool has_membarrier_get_registrations; bool has_pagemap_scan; bool has_shstk; + bool has_close_range; }; extern struct kerndat_s kdat; diff --git a/criu/include/util.h b/criu/include/util.h index 4334e69c2d..9037dc9e62 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -411,4 +411,6 @@ extern void util_init(void); extern char *resolve_mountpoint(char *path); +extern int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); + #endif /* __CR_UTIL_H__ */ diff --git a/criu/kerndat.c b/criu/kerndat.c index f899ef642c..1a584fe921 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1699,6 +1699,27 @@ static int kerndat_has_membarrier_get_registrations(void) return 0; } +static int kerndat_has_close_range(void) +{ + /* fd is greater than max_fd, so close_range should return EINVAL. 
*/ + if (cr_close_range(2, 1, 0) == 0) { + pr_err("close_range succeeded unexpectedly\n"); + return -1; + } + + if (errno == ENOSYS) { + pr_debug("close_range isn't supported\n"); + return 0; + } + if (errno != EINVAL) { + pr_perror("close_range returned unexpected error code"); + return -1; + } + + kdat.has_close_range = true; + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1956,6 +1977,10 @@ int kerndat_init(void) pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_close_range()) { + pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/util.c b/criu/util.c index 95ba0feda6..d74c2aeefa 100644 --- a/criu/util.c +++ b/criu/util.c @@ -54,6 +54,7 @@ #include "action-scripts.h" #include "compel/infect-util.h" +#include #define VMA_OPT_LEN 128 @@ -518,12 +519,25 @@ int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned return cr_system_userns(in, out, err, cmd, argv, flags, -1); } +int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) +{ + return syscall(__NR_close_range, fd, max_fd, flags); +} + static int close_fds(int minfd) { DIR *dir; struct dirent *de; int fd, ret, dfd; + if (kdat.has_close_range) { + if (cr_close_range(minfd, ~0, 0)) { + pr_perror("close_range failed"); + return -1; + } + return 0; + } + dir = opendir("/proc/self/fd"); if (dir == NULL) { pr_perror("Can't open /proc/self/fd"); From c2f9f900a94cbfefda9385f7dd1b757b226bbaa2 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 12 Jul 2024 23:30:35 +0700 Subject: [PATCH 269/321] zdtm: make cgroup testcases run non-parallel cgroup testcases live in the same cgroup root zdtmtst and zdtmtst.defaultroot controller then create child subgroup for testing. 
This can cause problems when cgroup testcases run in parallel. For example, testcase A dumps the child subgroup of testcase B since it's in the cgroup root but in the middle of restoring of testcase A, testcase B completes and cleans up the subgroup directory. This causes error in testcase A restore. This commit adds excl flag to all cgroup testcases description so that these don't run parallel. Signed-off-by: Bui Quang Minh --- test/zdtm/static/cgroup00.desc | 2 +- test/zdtm/static/cgroup01.desc | 2 +- test/zdtm/static/cgroup02.desc | 2 +- test/zdtm/static/cgroup_threads.desc | 2 +- test/zdtm/static/cgroup_yard.desc | 2 +- test/zdtm/static/cgroupns.desc | 2 +- test/zdtm/static/cgroupv2_00.desc | 2 +- test/zdtm/static/cgroupv2_01.desc | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/zdtm/static/cgroup00.desc b/test/zdtm/static/cgroup00.desc index 3c6c4a7e22..42a3f2b73a 100644 --- a/test/zdtm/static/cgroup00.desc +++ b/test/zdtm/static/cgroup00.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup01.desc b/test/zdtm/static/cgroup01.desc index 3c6c4a7e22..42a3f2b73a 100644 --- a/test/zdtm/static/cgroup01.desc +++ b/test/zdtm/static/cgroup01.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup02.desc b/test/zdtm/static/cgroup02.desc index df17a57891..eb5a9dd372 100644 --- a/test/zdtm/static/cgroup02.desc +++ b/test/zdtm/static/cgroup02.desc @@ -1,4 +1,4 @@ { 'dopts': '--manage-cgroups --cgroup-root name=zdtmtst:/prefix', - 'flags': 'suid', + 'flags': 'suid excl', 'flavor': 'h', 'ropts': '--manage-cgroups --cgroup-root /newroot --cgroup-root name=zdtmtst:/prefix'} diff --git a/test/zdtm/static/cgroup_threads.desc b/test/zdtm/static/cgroup_threads.desc index 3c6c4a7e22..42a3f2b73a 100644 --- 
a/test/zdtm/static/cgroup_threads.desc +++ b/test/zdtm/static/cgroup_threads.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup_yard.desc b/test/zdtm/static/cgroup_yard.desc index 8736d6780d..9ad4a9b578 100644 --- a/test/zdtm/static/cgroup_yard.desc +++ b/test/zdtm/static/cgroup_yard.desc @@ -1,6 +1,6 @@ { 'flavor': 'h', -'flags': 'suid', +'flags': 'suid excl', # We create the external cgroup yard in working directory during --pre-dump # hook. We have to go up a few directories to find the yard. 'opts': '--manage-cgroups --cgroup-yard ../../../../../../external_yard' diff --git a/test/zdtm/static/cgroupns.desc b/test/zdtm/static/cgroupns.desc index 80dd710e17..dc61e36cff 100644 --- a/test/zdtm/static/cgroupns.desc +++ b/test/zdtm/static/cgroupns.desc @@ -1,4 +1,4 @@ { 'feature': 'cgroupns', - 'flags': 'suid', + 'flags': 'suid excl', 'flavor': 'h', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroupv2_00.desc b/test/zdtm/static/cgroupv2_00.desc index 4bfd4b2656..e70c84df81 100644 --- a/test/zdtm/static/cgroupv2_00.desc +++ b/test/zdtm/static/cgroupv2_00.desc @@ -1 +1 @@ -{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} +{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_01.desc b/test/zdtm/static/cgroupv2_01.desc index 4bfd4b2656..e70c84df81 100644 --- a/test/zdtm/static/cgroupv2_01.desc +++ b/test/zdtm/static/cgroupv2_01.desc @@ -1 +1 @@ -{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} +{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} From 5c3f6217f44acf1edec57a8abf4edcb624accd6f Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Wed, 10 Jul 2024 18:34:50 +0200 Subject: [PATCH 270/321] Adjust to glibc __rseq_size semantic change In commit 2e456ccf0c34a056e3ccafac4a0c7effef14d918 ("Linux: Make 
__rseq_size useful for feature detection (bug 31965)") glibc 2.40 changed the meaning of __rseq_size slightly: it is now the size of the active/feature area (20 bytes initially), and not the size of the entire initially defined struct (32 bytes including padding). The reason for the change is that the size including padding does not allow detection of newly added features while previously unused padding is consumed. The prep_libc_rseq_info change in criu/cr-restore.c is not necessary on kernels which have full ptrace support for obtaining rseq information because the code is not used. On older kernels, it is a correctness fix because with size 20 (the new value), rseq registration would fail. The two other changes are required to make rseq unregistration work in tests. Signed-off-by: Florian Weimer --- criu/cr-restore.c | 8 ++++++++ test/zdtm/static/rseq00.c | 5 ++++- test/zdtm/transition/rseq01.c | 5 ++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 4db2f4ecfc..b95d4f134b 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2618,7 +2618,15 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) if (!kdat.has_ptrace_get_rseq_conf) { #if defined(__GLIBC__) && defined(RSEQ_SIG) rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); + /* + * Current glibc reports the feature/active size in + * __rseq_size, not the size passed to the kernel. + * This could be 20, but older kernels expect 32 for + * the size argument even if only 20 bytes are used. 
+ */ rseq->rseq_abi_size = __rseq_size; + if (rseq->rseq_abi_size < 32) + rseq->rseq_abi_size = 32; rseq->signature = RSEQ_SIG; #else rseq->rseq_abi_pointer = 0; diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c index 471ad6a43f..7add7801eb 100644 --- a/test/zdtm/static/rseq00.c +++ b/test/zdtm/static/rseq00.c @@ -46,12 +46,15 @@ static inline void *__criu_thread_pointer(void) static inline void unregister_glibc_rseq(void) { struct rseq *rseq = (struct rseq *)((char *)__criu_thread_pointer() + __rseq_offset); + unsigned int size = __rseq_size; /* hack: mark glibc rseq structure as failed to register */ rseq->cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED; /* unregister rseq */ - syscall(__NR_rseq, (void *)rseq, __rseq_size, 1, RSEQ_SIG); + if (__rseq_size < 32) + size = 32; + syscall(__NR_rseq, (void *)rseq, size, 1, RSEQ_SIG); } #else static inline void unregister_glibc_rseq(void) diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c index 0fbcc2dca0..08a7a8e1a6 100644 --- a/test/zdtm/transition/rseq01.c +++ b/test/zdtm/transition/rseq01.c @@ -33,7 +33,10 @@ static inline void *thread_pointer(void) static inline void unregister_old_rseq(void) { /* unregister rseq */ - syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); + unsigned int size = __rseq_size; + if (__rseq_size < 32) + size = 32; + syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), size, 1, RSEQ_SIG); } #else static inline void unregister_old_rseq(void) From 1b3ba301f9874eb20addf65304283f717ef8d515 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 9 Jul 2024 12:22:25 +0100 Subject: [PATCH 271/321] docs: update amdgpu-plugin man page This patch updates the dependencies section of the AMDGPU plugin man page to reflect that the plugin has been merged upstream and to fix a formatting issue. 
Signed-off-by: Radostin Stoyanov --- Documentation/criu-amdgpu-plugin.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 35321a9159..68803f3dbc 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -27,14 +27,10 @@ to criu to allow Checkpoint / Restore with ROCm. Dependencies -~~~~~~~~~~~~~~ +------------ *amdkfd support*:: In order to snapshot the *VRAM* and other *GPU* device states, we require - an updated version of amdkfd(amdgpu) driver. The kernel patches are under - review currently. - -*criu 3.16*:: - This work is rebased on latest criu release available at this time. + an updated version of amdkfd(amdgpu) driver. OPTIONS ------- From 8b04dd661f222a5a348a1cd20114b2f5124a4359 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 10 Jul 2024 03:30:34 +0100 Subject: [PATCH 272/321] plugins: set executable bit on .so files For historical reasons, some tools like rpm [1] or ldd [2,3] may expect the executable bit to be present for the correct identification of shared libraries. The executable bit on .so files is set by default by compilers (e.g., GCC). It is not strictly necessary but primarily a convention. 
[1] https://docs.fedoraproject.org/en-US/package-maintainers/CommonRpmlintIssues/#unstripped_binary_or_object [2] https://sourceware.org/git/?p=glibc.git;a=blob;f=elf/ldd.bash.in;h=d6b640df;hb=HEAD#l154 [3] $ sudo ldd /usr/lib/criu/*.so /usr/lib/criu/amdgpu_plugin.so: ldd: warning: you do not have execution permission for `/usr/lib/criu/amdgpu_plugin.so' linux-vdso.so.1 (0x00007fd0a2a3e000) libdrm.so.2 => /lib64/libdrm.so.2 (0x00007fd0a29eb000) libdrm_amdgpu.so.1 => /lib64/libdrm_amdgpu.so.1 (0x00007fd0a29de000) libc.so.6 => /lib64/libc.so.6 (0x00007fd0a27fc000) /lib64/ld-linux-x86-64.so.2 (0x00007fd0a2a40000) /usr/lib/criu/cuda_plugin.so: ldd: warning: you do not have execution permission for `/usr/lib/criu/cuda_plugin.so' linux-vdso.so.1 (0x00007f1806e13000) libc.so.6 => /lib64/libc.so.6 (0x00007f1806c08000) /lib64/ld-linux-x86-64.so.2 (0x00007f1806e15000) Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/Makefile | 2 +- plugins/cuda/Makefile | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 5efa8fb0ba..6dad001228 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -54,7 +54,7 @@ install: ifeq ($(CONFIG_AMDGPU),y) $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) + $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) endif .PHONY: install diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile index e337056dc1..cc3d98ac9d 100644 --- a/plugins/cuda/Makefile +++ b/plugins/cuda/Makefile @@ -31,11 +31,10 @@ mrproper: clean install: $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) + $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) .PHONY: install uninstall: $(E) " UNINSTALL" $(PLUGIN_NAME) $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) .PHONY: uninstall - From 
93746eb25014f21287f2edc5f5dd7545ce19ab52 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 20 Jul 2024 21:14:23 -0700 Subject: [PATCH 273/321] test/zdtm: mount a new tmpfs to the zdtm root /dev The current file system can be mounted with nodev. Fixes #2441 Signed-off-by: Andrei Vagin --- test/others/mnt-ext-dev/run.sh | 2 -- test/zdtm.py | 62 ++++++++++++++++++++++++++-------- test/zdtm/lib/ns.c | 15 +++++++- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/test/others/mnt-ext-dev/run.sh b/test/others/mnt-ext-dev/run.sh index 3f6163e084..5cdbc45a82 100755 --- a/test/others/mnt-ext-dev/run.sh +++ b/test/others/mnt-ext-dev/run.sh @@ -8,8 +8,6 @@ truncate -s 0 zdtm.loop truncate -s 50M zdtm.loop mkfs.ext4 -F zdtm.loop dev=`losetup --find --show zdtm.loop` -mkdir -p ../../dev -cp -ap $dev ../../dev export ZDTM_MNT_EXT_DEV=$dev python3 ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? losetup -d $dev diff --git a/test/zdtm.py b/test/zdtm.py index df23ea03d9..102f384c08 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -74,7 +74,10 @@ def traceit(f, e, a): def clean_tests_root(): global tests_root if tests_root and tests_root[0] == os.getpid(): + subprocess.call(["./umount2", os.path.join(tests_root[1], "dev")]) + os.rmdir(os.path.join(tests_root[1], "root/root")) os.rmdir(os.path.join(tests_root[1], "root")) + os.rmdir(os.path.join(tests_root[1], "dev")) os.rmdir(tests_root[1]) @@ -85,8 +88,18 @@ def make_tests_root(): tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", tmpdir)) atexit.register(clean_tests_root) os.mkdir(os.path.join(tests_root[1], "root")) - os.chmod(tests_root[1], 0o777) - return os.path.join(tests_root[1], "root") + os.mkdir(os.path.join(tests_root[1], "root", "root")) + # The current file system can be mounted with nodev, so let's create a + # new tmpfs mount for /dev. + devpath = os.path.join(tests_root[1], "dev") + os.mkdir(devpath) + # zdtm wants to create files on this mount. 
User namespace tests are + # running with custom user and group mappings. + subprocess.check_call(["mount", "-t", "tmpfs", "criu-test-dev", devpath]) + os.chmod(devpath, 0o777) + os.chmod(tests_root[1], 0o755) + os.chmod(os.path.join(tests_root[1], "root"), 0o755) + return os.path.join(tests_root[1], "root", "root"), os.path.join(tests_root[1], "dev") # Report generation @@ -182,15 +195,16 @@ def clean(): class ns_flavor: __root_dirs = [ - "/bin", "/sbin", "/etc", "/lib", "/lib64", "/dev", "/dev/pts", - "/dev/net", "/tmp", "/usr", "/proc", "/run" + "/bin", "/sbin", "/etc", "/lib", "/lib64", "/dev", + "/tmp", "/usr", "/proc", "/run" ] + __dev_dirs = ["pts", "net"] def __init__(self, opts): self.name = "ns" self.ns = True self.uns = False - self.root = make_tests_root() + self.root, self.devpath = make_tests_root() self.root_mounted = False def __copy_one(self, fname): @@ -236,16 +250,19 @@ def __copy_libs(self, binary): self.__copy_one(lib) def __mknod(self, name, rdev=None): - name = "/dev/" + name + tdev = stat.S_IFCHR if not rdev: - if not os.access(name, os.F_OK): + if not os.access(os.path.join("/dev", name), os.F_OK): print("Skipping %s at root" % name) return else: - rdev = os.stat(name).st_rdev + s = os.stat(os.path.join("/dev", name)) + rdev = s.st_rdev + if stat.S_ISBLK(s.st_mode): + tdev = stat.S_IFBLK - name = self.root + name - os.mknod(name, stat.S_IFCHR, rdev) + name = os.path.join(self.devpath, name) + os.mknod(name, tdev, rdev) os.chmod(name, 0o666) def __construct_root(self): @@ -256,11 +273,18 @@ def __construct_root(self): for ldir in ["/bin", "/sbin", "/lib", "/lib64"]: os.symlink(".." 
+ ldir, self.root + "/usr" + ldir) + def __construct_dev(self): + for dir in self.__dev_dirs: + os.mkdir(os.path.join(self.devpath, dir)) + os.chmod(os.path.join(self.devpath, dir), 0o755) self.__mknod("tty", os.makedev(5, 0)) self.__mknod("null", os.makedev(1, 3)) self.__mknod("net/tun") self.__mknod("rtc") self.__mknod("autofs", os.makedev(10, 235)) + ext_dev = os.getenv("ZDTM_MNT_EXT_DEV") + if ext_dev: + self.__mknod(os.path.basename(ext_dev)) def __copy_deps(self, deps): for d in deps.split('|'): @@ -283,6 +307,9 @@ def init(self, l_bins, x_bins): self.__construct_root() os.mknod(self.root + "/.constructed", stat.S_IFREG | 0o600) + if not os.access(self.devpath + "/.constructed", os.F_OK): + self.__construct_dev() + os.mknod(self.devpath + "/.constructed", stat.S_IFREG | 0o600) for b in l_bins: self.__copy_libs(b) for b in x_bins: @@ -480,6 +507,7 @@ def start(self): if self.__flavor.ns: env['ZDTM_NEWNS'] = "1" env['ZDTM_ROOT'] = self.__flavor.root + env['ZDTM_DEV'] = self.__flavor.devpath env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" if self.__flavor.uns: @@ -587,12 +615,18 @@ def __getcropts(self): return opts def getdopts(self): - return self.__getcropts() + self.__freezer.getdopts( - ) + self.__desc.get('dopts', '').split() + opts = self.__getcropts() + self.__freezer.getdopts() + \ + self.__desc.get('dopts', '').split() + if self.__flavor.ns: + opts += ["--external", "mnt[/dev]:ZDTM_DEV"] + return opts def getropts(self): - return self.__getcropts() + self.__freezer.getropts( - ) + self.__desc.get('ropts', '').split() + opts = self.__getcropts() + self.__freezer.getropts() + \ + self.__desc.get('ropts', '').split() + if self.__flavor.ns: + opts += ["--external", "mnt[ZDTM_DEV]:%s" % self.__flavor.devpath] + return opts def unlink_pidfile(self): self.__pid = 0 diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 6f6cccc992..205938d20b 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -27,7 +27,7 @@ extern int 
pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path; + char *root, *criu_path, *dev_path; char path[PATH_MAX]; root = getenv("ZDTM_ROOT"); @@ -51,6 +51,19 @@ static int prepare_mntns(void) return -1; } + dev_path = getenv("ZDTM_DEV"); + if (dev_path) { + snprintf(path, sizeof(path), "%s/dev", root); + if (mount(dev_path, path, NULL, MS_BIND, NULL)) { + pr_perror("Unable to mount %s", path); + return -1; + } + if (mount(NULL, path, NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to mount %s", path); + return -1; + } + } + criu_path = getenv("ZDTM_CRIU"); if (criu_path) { snprintf(path, sizeof(path), "%s%s", root, criu_path); From 7a274270316c71051af673bbf01acdc53ba0f168 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 31 Jul 2024 23:32:30 +0200 Subject: [PATCH 274/321] seize: fix pause-devices plugin hook The plugin hook "PAUSE_DEVICES" was recently introduced in the following commit. This hook was intended to execute the cuda-checkpoint tool before the process tree is frozen. However, the run_plugins() call has been placed immediately *after* freeze_processes(). This causes the cuda-checkpoint tool to hang indefinitely during the checkpointing of CUDA applications running in containers, eventually leading to its termination by the timeout alarm. 
a85f488595e0a3a6e6cc6ca7c94d4a00b1341aaf criu/plugin: Introduce new plugin hooks PAUSE_DEVICES and CHECKPOINT_DEVICES to be used during pstree collection This problem can be reproduced with the following example: sudo podman run -d --rm \ --device nvidia.com/gpu=all --security-opt=label=disable \ quay.io/radostin/cuda-counter sudo podman container checkpoint -l -e /tmp/checkpoint.tar Signed-off-by: Radostin Stoyanov --- criu/seize.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index d392259bc5..ae270022f7 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -983,6 +983,11 @@ int collect_pstree(void) */ alarm(opts.timeout); + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (opts.freeze_cgroup && cgroup_version()) goto err; @@ -991,11 +996,6 @@ int collect_pstree(void) if (opts.freeze_cgroup && freeze_processes()) goto err; - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto err; - } - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; From c4ba553d2e963c3578daaae46ad9b494b2c0d822 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 24 Jul 2024 11:30:59 +0100 Subject: [PATCH 275/321] plugin: enable multiple plugins for the same hook CRIU provides two plugins for checkpoint/restore of GPU applications: amdgpu and cuda. Both plugins use the `RESUME_DEVICES_LATE` hook to enable restore: CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) However, CRIU currently does not support running more than one plugin for the same hook. As a result, when both plugins are installed, the resume function for CUDA applications is not executed. To fix this, we need to make sure that both `plugin_resume_devices_late()` functions return `-ENOTSUP` when restore is not supported. 
Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 3 ++- plugins/cuda/cuda_plugin.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index a41469a509..b73b5101db 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1809,7 +1809,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); if (fd < 0) { pr_perror("failed to open kfd in plugin"); - return -1; + return -ENOTSUP; } args.pid = target_pid; @@ -1818,6 +1818,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { if (errno == ESRCH) { pr_info("Pid %d has no kfd process info\n", target_pid); + exit_code = -ENOTSUP; } else { pr_perror("restore late ioctl failed"); exit_code = -1; diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index b3f2fc8df7..f16c4c5051 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -408,7 +408,7 @@ int resume_device(int pid, int checkpointed) int cuda_plugin_resume_devices_late(int pid) { if (plugin_disabled) { - return 0; + return -ENOTSUP; } return resume_device(pid, 1); From b7f6b723d994811e52c4a9c25639c09e3cfe9585 Mon Sep 17 00:00:00 2001 From: liuchao173 Date: Thu, 8 Aug 2024 20:55:42 +0800 Subject: [PATCH 276/321] delete redundant include header files restorer.h has been included in line 43. Fixes: 22963d282729 ("Hide asm/restorer.h from sources") Signed-off-by: liuchao173 --- criu/pie/restorer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 7c34c06d47..51ed6ed4c8 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -49,7 +49,6 @@ #include "images/inventory.pb-c.h" #include "shmem.h" -#include "restorer.h" /* * sys_getgroups() buffer size. Not too much, to avoid stack overflow. 
From 883d442f9ed12c403dc3036822f7d14b93bd18e9 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 16:04:36 +0100 Subject: [PATCH 277/321] ci/podman: show criu logs in case of error Signed-off-by: Radostin Stoyanov --- scripts/ci/podman-test.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 72ad59a501..3198589f58 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -24,6 +24,9 @@ podman info podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' +# Show criu logs in case of error +trap 'cat /var/lib/containers/storage/overlay-containers/*/userdata/*.log' EXIT + sleep 1 for i in $(seq 20); do echo "Test $i for podman container checkpoint" @@ -64,3 +67,5 @@ for i in $(seq 20); do podman ps -a rm -f /tmp/chkpt.tar.gz done + +trap 'echo PASS' EXIT \ No newline at end of file From 756a7aa639ca36ca61f5817e7a86cadcdbf83905 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 16:07:59 +0100 Subject: [PATCH 278/321] ci/podman: show mounts Show information about mounts available on the host filesystem. This is useful for debugging. Signed-off-by: Radostin Stoyanov --- scripts/ci/podman-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 3198589f58..185783011d 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -20,6 +20,7 @@ sed -i 's/#runtime\s*=\s*.*/runtime = "runc"/' /usr/share/containers/containers. 
# Test checkpoint/restore with action script echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf +cat /proc/self/mountinfo podman info podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' From 208f60f04638d44594dfe3845e6a2aec82e16e1a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 8 Jul 2024 16:53:39 +0100 Subject: [PATCH 279/321] cuda: don't leak fds to cuda-checkpoint Leaking open file descriptors to third-party tools can lead to security risks. Signed-off-by: Radostin Stoyanov --- criu/include/util.h | 1 + criu/util.c | 2 +- plugins/cuda/cuda_plugin.c | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/criu/include/util.h b/criu/include/util.h index 9037dc9e62..435469e1ec 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -170,6 +170,7 @@ extern pid_t fork_and_ptrace_attach(int (*child_setup)(void)); extern int cr_daemon(int nochdir, int noclose, int close_fd); extern int status_ready(void); extern int is_root_user(void); +extern int close_fds(int minfd); extern int set_proc_self_fd(int fd); diff --git a/criu/util.c b/criu/util.c index d74c2aeefa..7dfa1fe424 100644 --- a/criu/util.c +++ b/criu/util.c @@ -524,7 +524,7 @@ int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) return syscall(__NR_close_range, fd, max_fd, flags); } -static int close_fds(int minfd) +int close_fds(int minfd) { DIR *dir; struct dirent *de; diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index f16c4c5051..e44b4d007f 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -115,7 +115,9 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) if (dup2(fd[WRITE], STDERR_FILENO) == -1) { return -1; } - close(fd[READ]); + + close_fds(STDERR_FILENO + 1); + return execvp(args[0], (char **)args); } else { // parent close(fd[WRITE]); From b83f131c12b0a1578df32bf58245d33fe2c39e8a Mon 
Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 18:48:30 +0100 Subject: [PATCH 280/321] cuda: fix launch cuda-checkpoint When the cuda-checkpoint tool is not installed, execvp() is expected to fail and return -1. In this case, we need to call exit() to terminate the child process that was created earlier with fork(). Since CRIU can be used with applications that do not use CUDA, even when the CUDA plugin is installed, this patch also updates the log messages to show debug and warning (instead of error) when the cuda-checkpoint tool is not found in $PATH. Signed-off-by: Radostin Stoyanov Signed-off-by: Andrei Vagin --- plugins/cuda/cuda_plugin.c | 133 ++++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 45 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index e44b4d007f..39c78e370b 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -45,7 +45,7 @@ struct pid_info { /* Used to track which PID's we've paused CUDA operations on so far so we can * release them after we're done with the DUMP */ -struct list_head cuda_pids; +static LIST_HEAD(cuda_pids); static void dealloc_pid_buffer(struct list_head *pid_buf) { @@ -91,7 +91,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) { #define READ 0 #define WRITE 1 - int fd[2]; + int fd[2], buf_off; if (pipe(fd) != 0) { pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); @@ -110,68 +110,103 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) if (child_pid == 0) { // child if (dup2(fd[WRITE], STDOUT_FILENO) == -1) { - return -1; + pr_perror("unable to clone fd %d->%d", fd[WRITE], STDOUT_FILENO); + _exit(EXIT_FAILURE); } if (dup2(fd[WRITE], STDERR_FILENO) == -1) { - return -1; + pr_perror("unable to clone fd %d->%d", fd[WRITE], STDERR_FILENO); + _exit(EXIT_FAILURE); } + close(fd[READ]); close_fds(STDERR_FILENO + 1); - return execvp(args[0], (char **)args); - } 
else { // parent - close(fd[WRITE]); + execvp(args[0], (char **)args); - int bytes_read = read(fd[READ], buf, buf_size); - if (bytes_read > 0) { - buf[bytes_read - 1] = '\0'; - } + /* We can't use pr_error() as log file fd is closed. */ + fprintf(stderr, "execvp(\"%s\") failed: %s\n", args[0], strerror(errno)); - // Clear out any of the remaining output in the pipe in case the buffer wasn't large enough - struct pollfd read_poll = { .fd = fd[READ], .events = POLLIN | POLLHUP }; - while (true) { - int poll_status = poll(&read_poll, 1, -1); - if (poll_status == -1) { - close(fd[READ]); - pr_err("Unexpected error when clearing cuda-checkpoint output buffer\n"); - return -1; - } - if (read_poll.revents & POLLHUP) { - break; - } - // POLLIN, read into scratch buffer to flush things out - char scratch[64]; - bytes_read = read(fd[READ], scratch, sizeof(scratch)); - } + _exit(EXIT_FAILURE); + } - int status; - if (waitpid(child_pid, &status, 0) == -1 || !WIFEXITED(status)) { - pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n"); - close(fd[READ]); - return -1; + close(fd[WRITE]); + buf_off = 0; + /* Reserve one byte for the null charracter. 
*/ + buf_size--; + while (buf_off < buf_size) { + int bytes_read; + bytes_read = read(fd[READ], buf + buf_off, buf_size - buf_off); + if (bytes_read == -1) { + pr_perror("Unable to read output of cuda-checkpoint"); + goto err; + } + if (bytes_read == 0) + break; + buf_off += bytes_read; + } + buf[buf_off] = '\0'; + + /* Clear out any of the remaining output in the pipe in case the buffer wasn't large enough */ + while (true) { + char scratch[1024]; + int bytes_read; + bytes_read = read(fd[READ], scratch, sizeof(scratch)); + if (bytes_read == -1) { + pr_perror("Unable to read output of cuda-checkpoint"); + goto err; } + if (bytes_read == 0) + break; + } + close(fd[READ]); - close(fd[READ]); + int status, exit_code = -1; + if (waitpid(child_pid, &status, 0) == -1) { + pr_perror("Unable to wait for the cuda-checkpoint process %d", child_pid); + goto err; + } + if (WIFSIGNALED(status)) { + int sig = WTERMSIG(status); - return WEXITSTATUS(status); + pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); + } else if (WIFEXITED(status)) { + exit_code = WEXITSTATUS(status); + } else { + pr_err("cuda-checkpoint exited improperly: %u\n", status); } + + if (exit_code != EXIT_SUCCESS) + pr_debug("cuda-checkpoint output ===>\n%s\n" + "<=== cuda-checkpoint output\n", + buf); + + return exit_code; +err: + kill(child_pid, SIGKILL); + waitpid(child_pid, NULL, 0); + return -1; } -static bool cuda_checkpoint_supports_flag(const char *flag) +/** + * Checks if a given flag is supported by the cuda-checkpoint utility + * + * Returns: + * 1 if the flag is supported, + * 0 if the flag is not supported, + * -1 if there was an error launching the cuda-checkpoint utility. 
+ */ +static int cuda_checkpoint_supports_flag(const char *flag) { char msg_buf[2048]; const char *args[] = { CUDA_CHECKPOINT, "-h", NULL }; - int ret = launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)); - if (ret != 0) { - pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n"); - return false; - } - if (strstr(msg_buf, flag) == NULL) { - return false; - } + if (launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)) != 0) + return -1; + + if (strstr(msg_buf, flag) == NULL) + return 0; - return true; + return 1; } /* Retrieve the cuda restore thread TID from the root pid */ @@ -419,7 +454,15 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_ int cuda_plugin_init(int stage) { - if (!cuda_checkpoint_supports_flag("--action")) { + int ret = cuda_checkpoint_supports_flag("--action"); + + if (ret == -1) { + pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT); + plugin_disabled = true; + return 0; + } + + if (ret == 0) { pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n"); plugin_disabled = true; return 0; From 0a5dfcf8bd83f36a6a9ec164b0a9ca2fffc1476e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 23:52:42 +0100 Subject: [PATCH 281/321] zdtm: add option to run tests with criu plugins By default, if the "CRIU_LIBS_DIR" environment variable is not set, CRIU will load all plugins installed in `/usr/lib/criu`. This may result in running the ZDTM tests with plugins for a different version of CRIU (e.g., installed from a package). This patch updates ZDTM to always set the "CRIU_LIBS_DIR" environment variable and use a local "plugins" directory. This directory contains copies of the plugin files built from source. In addition, this patch adds the `--criu-plugin` option to the `zdtm.py run` command, allowing tests to be run with specified CRIU plugins. 
Example: - Run test only with AMDGPU plugin ./zdtm.py run -t zdtm/static/busyloop00 --criu-plugin amdgpu - Run test only with CUDA plugin ./zdtm.py run -t zdtm/static/busyloop00 --criu-plugin cuda - Run test with both AMDGPU and CUDA plugins ./zdtm.py run -t zdtm/static/busyloop00 --criu-plugin amdgpu cuda Signed-off-by: Radostin Stoyanov --- test/plugins/.gitignore | 1 + test/plugins/Makefile | 18 ++++++++++++++++++ test/zdtm.py | 21 ++++++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 test/plugins/.gitignore create mode 100644 test/plugins/Makefile diff --git a/test/plugins/.gitignore b/test/plugins/.gitignore new file mode 100644 index 0000000000..140f8cf80f --- /dev/null +++ b/test/plugins/.gitignore @@ -0,0 +1 @@ +*.so diff --git a/test/plugins/Makefile b/test/plugins/Makefile new file mode 100644 index 0000000000..7827b655c4 --- /dev/null +++ b/test/plugins/Makefile @@ -0,0 +1,18 @@ +SRC_DIR := ../../plugins +PLUGIN_TARGETS := amdgpu_plugin.so cuda_plugin.so + +# Silent make rules. +Q := @ + +all: $(PLUGIN_TARGETS) + +amdgpu_plugin.so: $(SRC_DIR)/amdgpu/amdgpu_plugin.so + $(Q) cp $< $@ + +cuda_plugin.so: $(SRC_DIR)/cuda/cuda_plugin.so + $(Q) cp $< $@ + +clean: + $(Q) $(RM) $(PLUGIN_TARGETS) + +.PHONY: all clean diff --git a/test/zdtm.py b/test/zdtm.py index 102f384c08..87914f740b 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -40,6 +40,12 @@ "libfault.so" ) +# A directory that contains the CRIU plugins. 
+PLUGINS_DIR = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "plugins" +) + prev_line = None uuid = uuid.uuid4() @@ -672,6 +678,12 @@ def available(): subprocess.check_call(["make", "-C", "zdtm/"]) if 'preload_libfault' in opts and opts['preload_libfault']: subprocess.check_call(["make", "-C", "libfault/"]) + + subprocess.check_call(["make", '--no-print-directory', "-C", "plugins/", "clean"]) + if 'criu_plugin' in opts and opts['criu_plugin']: + for name in opts['criu_plugin']: + subprocess.check_call(["make", '--no-print-directory', "-C", "plugins/", f"{name}_plugin.so"]) + if 'rootless' in opts and opts['rootless']: return subprocess.check_call( @@ -929,7 +941,9 @@ def run(action, timeout=60): env = dict( os.environ, - ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0") + ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0", + CRIU_LIBS_DIR=PLUGINS_DIR + ) if fault: print("Forcing %s fault" % fault) @@ -2852,6 +2866,11 @@ def get_cli_args(): rp.add_argument("--test-shard-count", type=int, default=0, help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") + rp.add_argument("--criu-plugin", + help="Run tests with CRIU plugin", + choices=['amdgpu', 'cuda'], + nargs='+', + default=None) lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) From 919de60cfac8c10af502cf8dd1a7131c756f18c1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 20 Jul 2024 13:17:09 +0100 Subject: [PATCH 282/321] ci: run tests with amdgpu and cuda plugins Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index ef2dffb1a4..950453c0d4 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh 
@@ -354,7 +354,15 @@ make -C test/others/action-script run # compel testing make -C compel/test -# amdgpu_plugin testing +# amdgpu and cuda plugin testing make amdgpu_plugin make -C plugins/amdgpu/ test_topology_remap ./plugins/amdgpu/test_topology_remap + +./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin cuda +./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin amdgpu +./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin amdgpu cuda + +./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin cuda +./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin amdgpu +./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin amdgpu cuda From 8cf9722a1c94b7107b0934426cc352a49955e53e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 25 Jul 2024 09:50:10 +0100 Subject: [PATCH 283/321] timer: fix printf specifiers for __suseconds64_t New internal glibc types __timeval64 [1] and __suseconds64_t [2] have been introduced as a solution for the Y2038 problem [3]. These 64-bit types are used across all architectures. 
However, this change causes the following build errors when cross-compiling on ARMv7 (armhf): criu/timer.c:49:17: error: format '%ld' expects argument of type 'long int', but argument 5 has type '__suseconds64_t' {aka 'long long int'} [-Werror=format=] 49 | pr_info("Restored %s timer to %" PRId64 ".%ld -> %" PRId64 ".%ld\n", n, | ^~~~~~~~~~~~~~~~~~~~~~~~ 50 | (int64_t)val->it_value.tv_sec, val->it_value.tv_usec, | ~~~~~~~~~~~~~~~~~~~~~ | | | __suseconds64_t {aka long long int} criu/timer.c:49:17: error: format '%ld' expects argument of type 'long int', but argument 7 has type '__suseconds64_t' {aka 'long long int'} [-Werror=format=] 49 | pr_info("Restored %s timer to %" PRId64 ".%ld -> %" PRId64 ".%ld\n", n, | ^~~~~~~~~~~~~~~~~~~~~~~~ 50 | (int64_t)val->it_value.tv_sec, val->it_value.tv_usec, 51 | (int64_t)val->it_interval.tv_sec, val->it_interval.tv_usec); | ~~~~~~~~~~~~~~~~~~~~~~~~ | | | __suseconds64_t {aka long long int} ns.c:234:48: error: format '%ld' expects argument of type 'long int', but argument 5 has type 'time_t' {aka 'long long int'} [-Werror=format=] 234 | len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, offset); | ~~^ ~~~~~~ | | | | long int time_t {aka long long int} | %lld msg.c:58:41: error: format '%ld' expects argument of type 'long int', but argument 3 has type '__suseconds64_t' {aka 'long long int'} [-Werror=format=] 58 | off += sprintf(buf + off, ".%.3ld: ", tv.tv_usec / 1000); | ~~~~^ ~~~~~~~~~~~~~~~~~ | | | | long int __suseconds64_t {aka long long int} | %.3lld ../lib/zdtmtst.h:137:26: error: format '%ld' expects argument of type 'long int', but argument 4 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 137 | test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ | ^~~~~~~~~~~~~~ pthread_timers_h.c:72:17: note: in expansion of macro 'pr_perror' 72 | pr_perror("wrong interval: %ld:%ld", itimerspec.it_interval.tv_sec, itimerspec.it_interval.tv_nsec); | ^~~~~~~~~ vdso00.c:22:32: error: 
format '%li' expects argument of type 'long int', but argument 3 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 22 | test_msg("%d time: %10li\n", getpid(), tv.tv_sec); | ~~~~^ ~~~~~~~~~ | | | | long int __time64_t {aka long long int} | %10lli vdso00.c:29:32: error: format '%li' expects argument of type 'long int', but argument 3 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 29 | test_msg("%d time: %10li\n", getpid(), tv.tv_sec); | ~~~~^ ~~~~~~~~~ | | | | long int __time64_t {aka long long int} | %10lli vdso01.c:357:42: error: format '%li' expects argument of type 'long int', but argument 2 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 357 | test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); | ~~^ ~~~~~~~~~~ | | | | long int __time64_t {aka long long int} | %lli vdso01.c:357:72: error: format '%li' expects argument of type 'long int', but argument 3 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 357 | test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); | ~~^ ~~~~~~~~~~ | | | | long int __time64_t {aka long long int} | vdso01.c:328:43: error: format '%li' expects argument of type 'long int', but argument 2 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 328 | test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); | ~~^ ~~~~~~~~~~ | | | | long int __time64_t {aka long long int} | %lli vdso01.c:328:74: error: format '%li' expects argument of type 'long int', but argument 3 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 328 | test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); | ~~^ ~~~~~~~~~~ | | | | long int __time64_t {aka long long int} | ../lib/zdtmtst.h:144:26: error: format '%ld' expects argument of type 'long int', but argument 4 has type 'time_t' {aka 'long long int'} [-Werror=format=] 144 | 
test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ | ^~~~~~~~~~~~~~~ mtime_mmap.c:80:17: note: in expansion of macro 'fail' 80 | fail("mtime %ld wasn't updated on mmapped %s file", mtime_new, filename); | ^~~~ ../lib/zdtmtst.h:144:26: error: format '%ld' expects argument of type 'long int', but argument 4 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 144 | test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ | ^~~~~~~~~~~~~~~ mtime_mmap.c:101:17: note: in expansion of macro 'fail' 101 | fail("After migration, mtime changed to %ld", fst.st_mtime); | ^~~~ [1] https://sourceware.org/git/?p=glibc.git;h=504c98717062cb9bcbd4b3e59e932d04331ddca5 [2] https://sourceware.org/git/?p=glibc.git;h=3fced064f23562ec24f8312ffbc14950993969e6 [3] https://en.wikipedia.org/wiki/Year_2038_problem Signed-off-by: Radostin Stoyanov --- criu/timer.c | 6 +++--- test/zdtm/lib/msg.c | 3 ++- test/zdtm/lib/ns.c | 3 ++- test/zdtm/static/mtime_mmap.c | 5 +++-- test/zdtm/static/pthread_timers.c | 4 +++- test/zdtm/static/vdso00.c | 6 +++--- test/zdtm/static/vdso01.c | 7 +++++-- 7 files changed, 21 insertions(+), 13 deletions(-) diff --git a/criu/timer.c b/criu/timer.c index 4b286635de..e94cf0280d 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -46,9 +46,9 @@ static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) return -1; } - pr_info("Restored %s timer to %" PRId64 ".%ld -> %" PRId64 ".%ld\n", n, - (int64_t)val->it_value.tv_sec, val->it_value.tv_usec, - (int64_t)val->it_interval.tv_sec, val->it_interval.tv_usec); + pr_info("Restored %s timer to %" PRId64 ".%" PRId64 " -> %" PRId64 ".%" PRId64 "\n", n, + (int64_t)val->it_value.tv_sec, (int64_t)val->it_value.tv_usec, + (int64_t)val->it_interval.tv_sec, (int64_t)val->it_interval.tv_usec); return 0; } diff --git a/test/zdtm/lib/msg.c b/test/zdtm/lib/msg.c index 1cf92e3e01..9ba1c47a43 100644 --- a/test/zdtm/lib/msg.c +++ 
b/test/zdtm/lib/msg.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -55,7 +56,7 @@ void test_msg(const char *format, ...) off += strftime(buf, sizeof(buf), "%H:%M:%S", tm); } - off += sprintf(buf + off, ".%.3ld: ", tv.tv_usec / 1000); + off += sprintf(buf + off, ".%.3" PRId64 ": ", (int64_t)(tv.tv_usec / 1000)); off += sprintf(buf + off, "%5d: ", getpid()); skip: diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 205938d20b..3c0dbdeb80 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -231,7 +232,7 @@ static inline int _settime(clockid_t clk_id, time_t offset) if (clk_id == CLOCK_MONOTONIC_COARSE || clk_id == CLOCK_MONOTONIC_RAW) clk_id = CLOCK_MONOTONIC; - len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, offset); + len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, (int64_t)offset); fd = open("/proc/self/timens_offsets", O_WRONLY); if (fd < 0) { diff --git a/test/zdtm/static/mtime_mmap.c b/test/zdtm/static/mtime_mmap.c index faa2d6fad7..4de8438ee2 100644 --- a/test/zdtm/static/mtime_mmap.c +++ b/test/zdtm/static/mtime_mmap.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -77,7 +78,7 @@ int main(int argc, char **argv) mtime_new = fst.st_mtime; /* time of last modification */ if (mtime_new <= mtime_old) { - fail("mtime %ld wasn't updated on mmapped %s file", mtime_new, filename); + fail("mtime %" PRId64 " wasn't updated on mmapped %s file", (int64_t)mtime_new, filename); goto failed; } @@ -98,7 +99,7 @@ int main(int argc, char **argv) /* time of last modification */ if (fst.st_mtime != mtime_new) { - fail("After migration, mtime changed to %ld", fst.st_mtime); + fail("After migration, mtime changed to %" PRId64, (int64_t)fst.st_mtime); goto failed; } diff --git a/test/zdtm/static/pthread_timers.c b/test/zdtm/static/pthread_timers.c index 5246a985fd..b1b2a9a23d 100644 --- a/test/zdtm/static/pthread_timers.c +++ 
b/test/zdtm/static/pthread_timers.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -69,7 +70,8 @@ int main(int argc, char **argv) } if (itimerspec.it_interval.tv_nsec != TEST_INTERVAL_NSEC || itimerspec.it_interval.tv_sec) { - pr_perror("wrong interval: %ld:%ld", itimerspec.it_interval.tv_sec, itimerspec.it_interval.tv_nsec); + pr_perror("wrong interval: %" PRId64 ":%" PRId64, + (int64_t)itimerspec.it_interval.tv_sec, (int64_t)itimerspec.it_interval.tv_nsec); return 1; } diff --git a/test/zdtm/static/vdso00.c b/test/zdtm/static/vdso00.c index a9bef4dbd2..69123a2032 100644 --- a/test/zdtm/static/vdso00.c +++ b/test/zdtm/static/vdso00.c @@ -1,6 +1,6 @@ #include #include - +#include #include #include @@ -19,14 +19,14 @@ int main(int argc, char *argv[]) test_msg("%s pid %d\n", argv[0], getpid()); gettimeofday(&tv, &tz); - test_msg("%d time: %10li\n", getpid(), tv.tv_sec); + test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); test_daemon(); test_waitsig(); /* this call will fail if vDSO is corrupted */ gettimeofday(&tv, &tz); - test_msg("%d time: %10li\n", getpid(), tv.tv_sec); + test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); pass(); diff --git a/test/zdtm/static/vdso01.c b/test/zdtm/static/vdso01.c index 4e33d30a8f..d8b3c94d5e 100644 --- a/test/zdtm/static/vdso01.c +++ b/test/zdtm/static/vdso01.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -324,7 +325,8 @@ static int vdso_clock_gettime_handler(void *func) clock_gettime(CLOCK_REALTIME, &ts1); vdso_clock_gettime(CLOCK_REALTIME, &ts2); - test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); + test_msg("clock_gettime: tv_sec %" PRId64 " vdso_clock_gettime: tv_sec %" PRId64 "\n", + (int64_t)ts1.tv_sec, (int64_t)ts2.tv_sec); if (labs(ts1.tv_sec - ts2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); @@ -354,7 +356,8 @@ static int vdso_gettimeofday_handler(void *func) 
gettimeofday(&tv1, &tz); vdso_gettimeofday(&tv2, &tz); - test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); + test_msg("gettimeofday: tv_sec %" PRId64 " vdso_gettimeofday: tv_sec %" PRId64 "\n", + (int64_t)tv1.tv_sec, (int64_t)tv2.tv_sec); if (labs(tv1.tv_sec - tv2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); From 6d4eeb75a711625f8b34cfe660bae9f032dd7741 Mon Sep 17 00:00:00 2001 From: haozi007 Date: Tue, 6 Aug 2024 10:57:21 +0800 Subject: [PATCH 284/321] support user set remote mmap vma address 1. The VMA address automatically assigned by the OS may conflict with a VMA in the GPU live migration scenario; 2. so we should give the user the choice of address. Signed-off-by: haozi007 --- compel/include/uapi/infect.h | 1 + compel/src/lib/infect.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index cd62559097..7e6134f4bc 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -120,6 +120,7 @@ struct infect_ctx { open_proc_fn open_proc; int log_fd; /* fd for parasite code to send messages to */ + unsigned long remote_map_addr; /* User-specified address where to mmap parasitic code, default not set */ }; extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 79d00c9a10..1e3ffb9670 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -816,7 +816,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; pid_t pid = ctl->rpid; long sret = -ENOSYS; - int ret, fd, lfd; + int ret, fd, lfd, remote_flags; if (ctl->ictx.flags & INFECT_NO_MEMFD) return 1; @@ -860,7 +860,11 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, goto err_cure; } - ctl->remote_map = remote_mmap(ctl, NULL, size, remote_prot, MAP_FILE | MAP_SHARED, fd, 0); + remote_flags =
MAP_FILE | MAP_SHARED; + if (ctl->ictx.remote_map_addr){ + remote_flags |= MAP_FIXED_NOREPLACE; + } + ctl->remote_map = remote_mmap(ctl, (void *)ctl->ictx.remote_map_addr, size, remote_prot, remote_flags, fd, 0); if (!ctl->remote_map) { pr_err("Can't rmap memfd for parasite blob\n"); goto err_curef; From 6f09b4962b183ddd3eb90f6c2bcf12b2adaa9b2d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 16 Aug 2024 07:49:32 -0700 Subject: [PATCH 285/321] test/zdtm: allow to run tests with the mocked cuda-checkpoint tool Here is an example of how to run one test: $ python test/zdtm.py run -t zdtm/static/env00 --ignore-taint --mocked-cuda-checkpoint Signed-off-by: Andrei Vagin --- test/cuda-checkpoint/.gitignore | 1 + test/cuda-checkpoint/Makefile | 17 +++++++++ test/cuda-checkpoint/cuda-checkpoint.c | 53 ++++++++++++++++++++++++++ test/zdtm.py | 16 +++++++- 4 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 test/cuda-checkpoint/.gitignore create mode 100644 test/cuda-checkpoint/Makefile create mode 100644 test/cuda-checkpoint/cuda-checkpoint.c diff --git a/test/cuda-checkpoint/.gitignore b/test/cuda-checkpoint/.gitignore new file mode 100644 index 0000000000..717fb70286 --- /dev/null +++ b/test/cuda-checkpoint/.gitignore @@ -0,0 +1 @@ +cuda-checkpoint diff --git a/test/cuda-checkpoint/Makefile b/test/cuda-checkpoint/Makefile new file mode 100644 index 0000000000..c59dadddc7 --- /dev/null +++ b/test/cuda-checkpoint/Makefile @@ -0,0 +1,17 @@ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) + +BIN := cuda-checkpoint +SRC := cuda-checkpoint.c +DEP := $(SRC:%.c=%.d) +OBJ := $(SRC:%.c=%.o) +TARGETS := $(BIN) + +include ../zdtm/Makefile.inc + +all: $(TARGETS) +.PHONY: all + +clean-more: + $(RM) $(TARGETS) +.PHONY: clean-more +clean: clean-more diff --git a/test/cuda-checkpoint/cuda-checkpoint.c b/test/cuda-checkpoint/cuda-checkpoint.c new file mode 100644 index 0000000000..f35a4b41df --- /dev/null +++ b/test/cuda-checkpoint/cuda-checkpoint.c @@ -0,0 +1,53 @@ +/* The mocked
version of cuda-checkpoint. */ +#include +#include +#include + +int main(int argc, char *argv[]) +{ + int c; + + while (1) { + int option_index = 0; + static struct option long_options[] = { + { "pid", required_argument, 0, 'p' }, + { "get-restore-tid", no_argument, 0, 'g' }, + { "action", required_argument, 0, 'a' }, + { "timeout", required_argument, 0, 't' }, + { "help", no_argument, 0, 'h' }, + { 0, 0, 0, 0 } + }; + + c = getopt_long(argc, argv, "p:ga:ht:", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'p': + printf("%s\n", optarg); + break; + case 'g': + case 'a': + case 't': + break; + case 'h': + printf("--action - execute an action"); + break; + + default: + fprintf(stderr, "getopt returned character code 0%o ??\n", c); + return 1; + } + } + + if (optind < argc) { + fprintf(stderr, "non-option ARGV-elements: "); + while (optind < argc) + fprintf(stderr, "%s ", argv[optind++]); + fprintf(stderr, "\n"); + return 1; + } + + return 0; +} diff --git a/test/zdtm.py b/test/zdtm.py index 87914f740b..6b2132cc30 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -684,6 +684,8 @@ def available(): for name in opts['criu_plugin']: subprocess.check_call(["make", '--no-print-directory', "-C", "plugins/", f"{name}_plugin.so"]) + if 'mocked_cuda_checkpoint' in opts and opts['mocked_cuda_checkpoint']: + subprocess.check_call(["make", "-C", "cuda-checkpoint/"]) if 'rootless' in opts and opts['rootless']: return subprocess.check_call( @@ -1141,6 +1143,7 @@ def __init__(self, opts): self.__pre_dump_mode = opts['pre_dump_mode'] self.__preload_libfault = bool(opts['preload_libfault']) self.__mntns_compat_mode = bool(opts['mntns_compat_mode']) + self.__cuda_checkpoint = bool(opts['mocked_cuda_checkpoint']) if opts['rpc']: self.__criu = criu_rpc @@ -1223,6 +1226,9 @@ def __criu_act(self, action, opts=[], log=None, nowait=False): s_args = ["--log-file", log, "--images-dir", self.__ddir(), "--verbosity=4"] + opts + if self.__cuda_checkpoint: + s_args += 
[ "--libdir" , os.path.join(os.getcwd(), "..", "plugins", "cuda") ] + with open(os.path.join(self.__ddir(), action + '.cropt'), 'w') as f: f.write(' '.join(s_args) + '\n') @@ -2160,7 +2166,7 @@ def run_test(self, name, desc, flavor): 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless', 'preload_libfault') + 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2173,8 +2179,11 @@ def run_test(self, name, desc, flavor): if opts['rootless'] and os.getuid() == 0: os.setgid(NON_ROOT_UID) os.setuid(NON_ROOT_UID) + env = dict(os.environ, CR_CT_TEST_INFO=arg) + if opts['mocked_cuda_checkpoint']: + env['PATH'] = os.path.join(os.getcwd(), "cuda-checkpoint") + ":" + env["PATH"] sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], - env=dict(os.environ, CR_CT_TEST_INFO=arg), + env=env, stdout=log, stderr=subprocess.STDOUT, close_fds=True) @@ -2871,6 +2880,9 @@ def get_cli_args(): choices=['amdgpu', 'cuda'], nargs='+', default=None) + rp.add_argument("--mocked-cuda-checkpoint", + action="store_true", + help="Run criu with the cuda plugin and the mocked cuda-checkpoint tool") lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) From ad0b1969b603b2ff758d277d4b876077bea6a22d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 12 Aug 2024 07:56:28 -0700 Subject: [PATCH 286/321] criu/plugin: don't call plugin device hooks for non-alive tasks Dead tasks don't hold any resources. 
Fixes: 2465 Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 4 +++- criu/seize.c | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b95d4f134b..4d4dfbe6fe 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2238,8 +2238,10 @@ static int restore_root_task(struct pstree_item *init) * mapped memory) could be done sanely once the pie code hands * over the control to master process. */ + pr_info("Run late stage hook from criu master for external devices\n"); for_each_pstree_item(item) { - pr_info("Run late stage hook from criu master for external devices\n"); + if (!task_alive(item)) + continue; ret = run_plugins(RESUME_DEVICES_LATE, item->pid->real); /* * This may not really be an error. Only certain plugin hooks diff --git a/criu/seize.c b/criu/seize.c index ae270022f7..ba26072e6e 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1030,6 +1030,8 @@ int collect_pstree(void) } for_each_pstree_item(iter) { + if (!task_alive(iter)) + continue; ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); if (ret < 0 && ret != -ENOTSUP) goto err; From 8fce2b1adcaabf2b478395cc6d0e18e0cc7d8fad Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 12 Aug 2024 08:08:10 -0700 Subject: [PATCH 287/321] scripts/ci: run tests with the mocked cuda-checkpoint tool Signed-off-by: Andrei Vagin --- scripts/ci/run-ci-tests.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 950453c0d4..26ea00c537 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -359,10 +359,8 @@ make amdgpu_plugin make -C plugins/amdgpu/ test_topology_remap ./plugins/amdgpu/test_topology_remap -./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin cuda -./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin amdgpu -./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/maps00 -t 
zdtm/static/maps02 --criu-plugin cuda +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda -./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin cuda -./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin amdgpu -./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint From f1cb86856676eab6bb8b34663d8310d3dae638a3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 9 Jul 2024 13:07:03 +0100 Subject: [PATCH 288/321] plugins/amdgpu: fix cross-compilation To enable cross-compile we need to use the CC definition from criu/scripts/nmk/scripts/tools.mk: CC := $(CROSS_COMPILE)$(HOSTCC) Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 6dad001228..7d3388b80e 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -15,7 +15,6 @@ DEPS_NOK := ; __nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk -CC := gcc PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu From 870025c515543220e981fdd29ceb2d68d6476a34 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 9 Jul 2024 13:15:10 +0100 Subject: [PATCH 289/321] ci: enable cross compile testing for amdgpu-plugin Skip cross-compilation on armv7 because, among many other errors, it fails with the following: In file included from ../../include/common/lock.h:9, from ../../criu/include/files.h:9, from amdgpu_plugin.c:30: ../../include/common/asm/atomic.h:60:2: error: #error ARM architecture version (CONFIG_ARMV*) not set or unsupported. 60 | #error ARM architecture version (CONFIG_ARMV*) not set or unsupported. 
| ^~~~~ ../../include/common/asm/atomic.h: In function 'atomic_add_return': ../../include/common/asm/atomic.h:81:9: error: implicit declaration of function 'smp_mb' [-Werror=implicit-function-declaration] 81 | smp_mb(); | ^~~~~~ Signed-off-by: Radostin Stoyanov --- scripts/build/Dockerfile.stable-cross.tmpl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 6a68cd1ca6..078372c38c 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -21,7 +21,8 @@ RUN apt-install \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} + libnl-route-3-dev:${DEBIAN_ARCH} \ + libdrm-dev:${DEBIAN_ARCH} ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ @@ -39,4 +40,10 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +# amdgpu_plugin with armv7 is not supported +RUN make mrproper && date && \ + make -j $(nproc) && \ + if [ "$SUBARCH" != "armv7" ]; then \ + make -j $(nproc) amdgpu_plugin; \ + fi && \ + make -j $(nproc) zdtm && date From 5df8f864d99b697a02be7096d9a744a49105f010 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 21 Jul 2024 12:02:40 +0100 Subject: [PATCH 290/321] plugins/amdgpu: use C99-standard types Co-developed-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/kfd_ioctl.h | 350 ++++++++++++++++++------------------- 1 file changed, 175 insertions(+), 175 deletions(-) diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index e1ebb75a3a..1a3bcea955 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -39,8 +39,8 @@ #define KFD_IOCTL_MINOR_VERSION 8 struct kfd_ioctl_get_version_args { - __u32 major_version; /* from KFD */ - __u32 minor_version; /* from KFD */ + 
uint32_t major_version; /* from KFD */ + uint32_t minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type. */ @@ -53,51 +53,51 @@ struct kfd_ioctl_get_version_args { #define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - __u64 ring_base_address; /* to KFD */ - __u64 write_pointer_address; /* from KFD */ - __u64 read_pointer_address; /* from KFD */ - __u64 doorbell_offset; /* from KFD */ + uint64_t ring_base_address; /* to KFD */ + uint64_t write_pointer_address; /* from KFD */ + uint64_t read_pointer_address; /* from KFD */ + uint64_t doorbell_offset; /* from KFD */ - __u32 ring_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 queue_type; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ - __u32 queue_id; /* from KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t queue_type; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ + uint32_t queue_id; /* from KFD */ - __u64 eop_buffer_address; /* to KFD */ - __u64 eop_buffer_size; /* to KFD */ - __u64 ctx_save_restore_address; /* to KFD */ - __u32 ctx_save_restore_size; /* to KFD */ - __u32 ctl_stack_size; /* to KFD */ + uint64_t eop_buffer_address; /* to KFD */ + uint64_t eop_buffer_size; /* to KFD */ + uint64_t ctx_save_restore_address; /* to KFD */ + uint32_t ctx_save_restore_size; /* to KFD */ + uint32_t ctl_stack_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - __u32 queue_id; /* to KFD */ - __u32 pad; + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_update_queue_args { - __u64 ring_base_address; /* to KFD */ + uint64_t ring_base_address; /* to KFD */ - __u32 queue_id; /* to KFD */ - __u32 ring_size; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t 
queue_priority; /* to KFD */ }; struct kfd_ioctl_set_cu_mask_args { - __u32 queue_id; /* to KFD */ - __u32 num_cu_mask; /* to KFD */ - __u64 cu_mask_ptr; /* to KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t num_cu_mask; /* to KFD */ + uint64_t cu_mask_ptr; /* to KFD */ }; struct kfd_ioctl_get_queue_wave_state_args { - __u64 ctl_stack_address; /* to KFD */ - __u32 ctl_stack_used_size; /* from KFD */ - __u32 save_area_used_size; /* from KFD */ - __u32 queue_id; /* to KFD */ - __u32 pad; + uint64_t ctl_stack_address; /* to KFD */ + uint32_t ctl_stack_used_size; /* from KFD */ + uint32_t save_area_used_size; /* from KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -105,13 +105,13 @@ struct kfd_ioctl_get_queue_wave_state_args { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - __u64 alternate_aperture_base; /* to KFD */ - __u64 alternate_aperture_size; /* to KFD */ + uint64_t alternate_aperture_base; /* to KFD */ + uint64_t alternate_aperture_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 default_policy; /* to KFD */ - __u32 alternate_policy; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t default_policy; /* to KFD */ + uint32_t alternate_policy; /* to KFD */ + uint32_t pad; }; /* @@ -122,24 +122,24 @@ struct kfd_ioctl_set_memory_policy_args { */ struct kfd_ioctl_get_clock_counters_args { - __u64 gpu_clock_counter; /* from KFD */ - __u64 cpu_clock_counter; /* from KFD */ - __u64 system_clock_counter; /* from KFD */ - __u64 system_clock_freq; /* from KFD */ + uint64_t gpu_clock_counter; /* from KFD */ + uint64_t cpu_clock_counter; /* from KFD */ + uint64_t system_clock_counter; /* from KFD */ + uint64_t system_clock_freq; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_process_device_apertures { - __u64 lds_base; /* from KFD */ - __u64 
lds_limit; /* from KFD */ - __u64 scratch_base; /* from KFD */ - __u64 scratch_limit; /* from KFD */ - __u64 gpuvm_base; /* from KFD */ - __u64 gpuvm_limit; /* from KFD */ - __u32 gpu_id; /* from KFD */ - __u32 pad; + uint64_t lds_base; /* from KFD */ + uint64_t lds_limit; /* from KFD */ + uint64_t scratch_base; /* from KFD */ + uint64_t scratch_limit; /* from KFD */ + uint64_t gpuvm_base; /* from KFD */ + uint64_t gpuvm_limit; /* from KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t pad; }; /* @@ -152,20 +152,20 @@ struct kfd_ioctl_get_process_apertures_args { struct kfd_process_device_apertures process_apertures[NUM_OF_SUPPORTED_GPUS]; /* from KFD */ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - __u32 num_of_nodes; - __u32 pad; + uint32_t num_of_nodes; + uint32_t pad; }; struct kfd_ioctl_get_process_apertures_new_args { /* User allocated. Pointer to struct kfd_process_device_apertures * filled in by Kernel */ - __u64 kfd_process_device_apertures_ptr; + uint64_t kfd_process_device_apertures_ptr; /* to KFD - indicates amount of memory present in kfd_process_device_apertures_ptr * from KFD - Number of entries filled by KFD. 
*/ - __u32 num_of_nodes; - __u32 pad; + uint32_t num_of_nodes; + uint32_t pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -173,25 +173,25 @@ struct kfd_ioctl_get_process_apertures_new_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_unregister_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_address_watch_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; #define KFD_INVALID_FD 0xffffffff @@ -228,43 +228,43 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_MEM_ERR_GPU_HANG 3 struct kfd_ioctl_create_event_args { - __u64 event_page_offset; /* from KFD */ - __u32 event_trigger_data; /* from KFD - signal events only */ - __u32 event_type; /* to KFD */ - __u32 auto_reset; /* to KFD */ - __u32 node_id; /* to KFD - only valid for certain event types */ - __u32 event_id; /* from KFD */ - __u32 event_slot_index; /* from KFD */ + uint64_t event_page_offset; /* from KFD */ + uint32_t event_trigger_data; /* from KFD - signal events only */ + uint32_t event_type; /* to KFD */ + uint32_t auto_reset; /* to KFD */ + uint32_t node_id; /* to KFD - only valid for certain event types */ + uint32_t event_id; /* from KFD */ + uint32_t event_slot_index; /* 
from KFD */ }; struct kfd_ioctl_destroy_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_set_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_reset_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_memory_exception_failure { - __u32 NotPresent; /* Page not present or supervisor privilege */ - __u32 ReadOnly; /* Write access to a read-only page */ - __u32 NoExecute; /* Execute access to a page marked NX */ - __u32 imprecise; /* Can't determine the exact fault address */ + uint32_t NotPresent; /* Page not present or supervisor privilege */ + uint32_t ReadOnly; /* Write access to a read-only page */ + uint32_t NoExecute; /* Execute access to a page marked NX */ + uint32_t imprecise; /* Can't determine the exact fault address */ }; /* memory exception data */ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; - __u64 va; - __u32 gpu_id; - __u32 ErrorType; /* 0 = no RAS error, + uint64_t va; + uint32_t gpu_id; + uint32_t ErrorType; /* 0 = no RAS error, * 1 = ECC_SRAM, * 2 = Link_SYNFLOOD (poison), * 3 = GPU hang (not attributable to a specific cause), @@ -274,10 +274,10 @@ struct kfd_hsa_memory_exception_data { /* hw exception data */ struct kfd_hsa_hw_exception_data { - __u32 reset_type; - __u32 reset_cause; - __u32 memory_lost; - __u32 gpu_id; + uint32_t reset_type; + uint32_t reset_cause; + uint32_t memory_lost; + uint32_t gpu_id; }; /* Event data */ @@ -286,57 +286,57 @@ struct kfd_event_data { struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data; }; /* From KFD */ - __u64 kfd_event_data_ext; /* pointer to an extension structure for future exception types */ - __u32 event_id; /* to KFD */ - __u32 pad; + uint64_t kfd_event_data_ext; /* pointer to an 
extension structure for future exception types */ + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_wait_events_args { - __u64 events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - __u32 num_events; /* to KFD */ - __u32 wait_for_all; /* to KFD */ - __u32 timeout; /* to KFD */ - __u32 wait_result; /* from KFD */ + uint64_t events_ptr; /* pointed to struct kfd_event_data array, to KFD */ + uint32_t num_events; /* to KFD */ + uint32_t wait_for_all; /* to KFD */ + uint32_t timeout; /* to KFD */ + uint32_t wait_result; /* from KFD */ }; struct kfd_ioctl_set_scratch_backing_va_args { - __u64 va_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint64_t va_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_get_tile_config_args { /* to KFD: pointer to tile array */ - __u64 tile_config_ptr; + uint64_t tile_config_ptr; /* to KFD: pointer to macro tile array */ - __u64 macro_tile_config_ptr; + uint64_t macro_tile_config_ptr; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - __u32 num_tile_configs; + uint32_t num_tile_configs; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - __u32 num_macro_tile_configs; + uint32_t num_macro_tile_configs; - __u32 gpu_id; /* to KFD */ - __u32 gb_addr_config; /* from KFD */ - __u32 num_banks; /* from KFD */ - __u32 num_ranks; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t gb_addr_config; /* from KFD */ + uint32_t num_banks; /* from KFD */ + uint32_t num_ranks; /* from KFD */ /* struct size can be extended later if needed without breaking ABI compatibility */ }; struct kfd_ioctl_set_trap_handler_args { - __u64 tba_addr; /* to KFD */ - __u64 tma_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint64_t tba_addr; /* to KFD */ + uint64_t tma_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_acquire_vm_args { - __u32 drm_fd; /* 
to KFD */ - __u32 gpu_id; /* to KFD */ + uint32_t drm_fd; /* to KFD */ + uint32_t gpu_id; /* to KFD */ }; /* Allocation flags: memory types */ @@ -367,12 +367,12 @@ struct kfd_ioctl_acquire_vm_args { * @flags: memory type and attributes. See KFD_IOC_ALLOC_MEM_FLAGS above */ struct kfd_ioctl_alloc_memory_of_gpu_args { - __u64 va_addr; /* to KFD */ - __u64 size; /* to KFD */ - __u64 handle; /* from KFD */ - __u64 mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ - __u32 gpu_id; /* to KFD */ - __u32 flags; + uint64_t va_addr; /* to KFD */ + uint64_t size; /* to KFD */ + uint64_t handle; /* from KFD */ + uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ + uint32_t gpu_id; /* to KFD */ + uint32_t flags; }; /* Free memory allocated with kfd_ioctl_alloc_memory_of_gpu @@ -380,13 +380,13 @@ struct kfd_ioctl_alloc_memory_of_gpu_args { * @handle: memory handle returned by alloc */ struct kfd_ioctl_free_memory_of_gpu_args { - __u64 handle; /* to KFD */ + uint64_t handle; /* to KFD */ }; /* Map memory to one or more GPUs * * @handle: memory handle returned by alloc - * @device_ids_array_ptr: array of gpu_ids (__u32 per device) + * @device_ids_array_ptr: array of gpu_ids (uint32_t per device) * @n_devices: number of devices in the array * @n_success: number of devices mapped successfully * @@ -399,10 +399,10 @@ struct kfd_ioctl_free_memory_of_gpu_args { * n_devices. 
*/ struct kfd_ioctl_map_memory_to_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t n_devices; /* to KFD */ + uint32_t n_success; /* to/from KFD */ }; /* Unmap memory from one or more GPUs @@ -410,10 +410,10 @@ struct kfd_ioctl_map_memory_to_gpu_args { * same arguments as for mapping */ struct kfd_ioctl_unmap_memory_from_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t n_devices; /* to KFD */ + uint32_t n_success; /* to/from KFD */ }; /* Allocate GWS for specific queue @@ -424,28 +424,28 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { * only support contiguous GWS allocation */ struct kfd_ioctl_alloc_queue_gws_args { - __u32 queue_id; /* to KFD */ - __u32 num_gws; /* to KFD */ - __u32 first_gws; /* from KFD */ - __u32 pad; + uint32_t queue_id; /* to KFD */ + uint32_t num_gws; /* to KFD */ + uint32_t first_gws; /* from KFD */ + uint32_t pad; }; struct kfd_ioctl_get_dmabuf_info_args { - __u64 size; /* from KFD */ - __u64 metadata_ptr; /* to KFD */ - __u32 metadata_size; /* to KFD (space allocated by user) + uint64_t size; /* from KFD */ + uint64_t metadata_ptr; /* to KFD */ + uint32_t metadata_size; /* to KFD (space allocated by user) * from KFD (actual metadata size) */ - __u32 gpu_id; /* from KFD */ - __u32 flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ - __u32 dmabuf_fd; /* to KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ + uint32_t dmabuf_fd; /* to KFD */ }; struct kfd_ioctl_import_dmabuf_args { - __u64 va_addr; /* to KFD */ - __u64 handle; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 dmabuf_fd; /* to KFD */ + uint64_t 
va_addr; /* to KFD */ + uint64_t handle; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t dmabuf_fd; /* to KFD */ }; /* @@ -463,8 +463,8 @@ enum kfd_smi_event { #define KFD_SMI_EVENT_MSG_SIZE 96 struct kfd_ioctl_smi_events_args { - __u32 gpuid; /* to KFD */ - __u32 anon_fd; /* from KFD */ + uint32_t gpuid; /* to KFD */ + uint32_t anon_fd; /* from KFD */ }; /************************************************************************************************** @@ -510,33 +510,33 @@ enum kfd_criu_op { * Return: 0 on success, -errno on failure */ struct kfd_ioctl_criu_args { - __u64 devices; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 bos; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 priv_data; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 pid; /* Used during ops: PROCESS_INFO, RESUME */ - __u32 op; + uint64_t devices; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t bos; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t priv_data; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t pid; /* Used during ops: PROCESS_INFO, RESUME */ + uint32_t op; }; struct kfd_criu_device_bucket { - __u32 user_gpu_id; - __u32 actual_gpu_id; - __u32 drm_fd; - __u32 pad; + uint32_t user_gpu_id; + uint32_t actual_gpu_id; + uint32_t drm_fd; + uint32_t pad; }; struct kfd_criu_bo_bucket { - __u64 addr; - __u64 size; - __u64 offset; - __u64 restored_offset; /* During restore, updated offset for BO */ - __u32 gpu_id; /* This is the 
user_gpu_id */ - __u32 alloc_flags; - __u32 dmabuf_fd; - __u32 pad; + uint64_t addr; + uint64_t size; + uint64_t offset; + uint64_t restored_offset; /* During restore, updated offset for BO */ + uint32_t gpu_id; /* This is the user_gpu_id */ + uint32_t alloc_flags; + uint32_t dmabuf_fd; + uint32_t pad; }; /* CRIU IOCTLs - END */ @@ -616,8 +616,8 @@ enum kfd_ioctl_svm_attr_type { * @value: attribute value */ struct kfd_ioctl_svm_attribute { - __u32 type; - __u32 value; + uint32_t type; + uint32_t value; }; /** @@ -659,10 +659,10 @@ struct kfd_ioctl_svm_attribute { * attribute type to indicate the access for the specified GPU. */ struct kfd_ioctl_svm_args { - __u64 start_addr; - __u64 size; - __u32 op; - __u32 nattr; + uint64_t start_addr; + uint64_t size; + uint32_t op; + uint32_t nattr; /* Variable length array of attributes */ struct kfd_ioctl_svm_attribute attrs[0]; }; From 011529826973e2717d709651d2ec91e0d65e3151 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 10 Jul 2024 04:36:18 +0100 Subject: [PATCH 291/321] plugins/amdgpu: fix printf format specifiers Errors on aarch64: In file included from amdgpu_plugin_drm.h:10, from amdgpu_plugin.c:33: amdgpu_plugin.c: In function 'amdgpu_plugin_dump_file': amdgpu_plugin_util.h:24:20: error: format '%lld' expects argument of type 'long long int', but argument 6 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) 
print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin.c:1236:9: note: in expansion of macro 'pr_info' 1236 | pr_info("devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, args.num_objects, | ^~~~~~~ cc1: all warnings being treated as errors Errors on ppc64: In file included from amdgpu_plugin_drm.h:10, from amdgpu_plugin.c:33: amdgpu_plugin.c: In function 'amdgpu_plugin_dump_file': amdgpu_plugin_util.h:24:20: error: format '%llu' expects argument of type 'long long unsigned int', but argument 6 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin.c:1236:9: note: in expansion of macro 'pr_info' 1236 | pr_info("devices:%u bos:%u objects:%u priv_data:%llu\n", | ^~~~~~~ cc1: all warnings being treated as errors In file included from amdgpu_plugin_util.c:38: amdgpu_plugin_util.c: In function 'print_kfd_bo_stat': amdgpu_plugin_util.h:24:20: error: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin_util.c:196:17: note: in expansion of macro 'pr_info' 196 | pr_info("%s(), %d. 
KFD BO Addr: %llx \n", __func__, idx, bo->addr); | ^~~~~~~ amdgpu_plugin_util.h:24:20: error: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin_util.c:197:17: note: in expansion of macro 'pr_info' 197 | pr_info("%s(), %d. KFD BO Size: %llx \n", __func__, idx, bo->size); | ^~~~~~~ amdgpu_plugin_util.h:24:20: error: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin_util.c:198:17: note: in expansion of macro 'pr_info' 198 | pr_info("%s(), %d. KFD BO Offset: %llx \n", __func__, idx, bo->offset); | ^~~~~~~ amdgpu_plugin_util.h:24:20: error: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin_util.c:199:17: note: in expansion of macro 'pr_info' 199 | pr_info("%s(), %d. 
KFD BO Restored Offset: %llx \n", __func__, idx, bo->restored_offset); | ^~~~~~~ cc1: all warnings being treated as errors Co-developed-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 4 ++-- plugins/amdgpu/amdgpu_plugin_util.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index b73b5101db..707aea5a98 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1233,8 +1233,8 @@ int amdgpu_plugin_dump_file(int fd, int id) goto exit; } - pr_info("devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, args.num_objects, - args.priv_data_size); + pr_info("devices:%" PRIu32 " bos:%" PRIu32 " objects:%" PRIu32 " priv_data:%" PRIu64 "\n", + args.num_devices, args.num_bos, args.num_objects, args.priv_data_size); e = xmalloc(sizeof(*e)); if (!e) { diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index 62e569fc85..a165fc9cd5 100755 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -193,10 +193,10 @@ void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list) for (int idx = 0; idx < bo_cnt; idx++) { bo = &bo_list[idx]; pr_info("\n"); - pr_info("%s(), %d. KFD BO Addr: %llx \n", __func__, idx, bo->addr); - pr_info("%s(), %d. KFD BO Size: %llx \n", __func__, idx, bo->size); - pr_info("%s(), %d. KFD BO Offset: %llx \n", __func__, idx, bo->offset); - pr_info("%s(), %d. KFD BO Restored Offset: %llx \n", __func__, idx, bo->restored_offset); + pr_info("%s(), %d. KFD BO Addr: %" PRIx64 " \n", __func__, idx, bo->addr); + pr_info("%s(), %d. KFD BO Size: %" PRIx64 " \n", __func__, idx, bo->size); + pr_info("%s(), %d. KFD BO Offset: %" PRIx64 " \n", __func__, idx, bo->offset); + pr_info("%s(), %d. KFD BO Restored Offset: %" PRIx64 " \n", __func__, idx, bo->restored_offset); pr_info("%s(), %d. 
KFD BO Alloc Flags: %x \n", __func__, idx, bo->alloc_flags); pr_info("%s(), %d. KFD BO Gpu ID: %x \n", __func__, idx, bo->gpu_id); pr_info("%s(), %d. KFD BO Dmabuf FD: %x \n", __func__, idx, bo->dmabuf_fd); From 47f81cd75156352ad4feef42a2839fd92dc67e11 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 5 Jul 2024 08:22:55 +0000 Subject: [PATCH 292/321] crit: do not crash on aarch64 doing 'crit x ./ rss' Running 'crit x ./ rss' on aarch64 crashes with: File "/home/criu/crit/crit/__main__.py", line 331, in explore_rss while vmas[vmi]['start'] < pme: ~~~~^^^^^ IndexError: list index out of range This adds an additional check to the while loop to avoid accessing indexes out of range. Signed-off-by: Adrian Reber --- crit/crit/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crit/crit/__main__.py b/crit/crit/__main__.py index e15327f503..bce5234456 100755 --- a/crit/crit/__main__.py +++ b/crit/crit/__main__.py @@ -323,12 +323,12 @@ def explore_rss(opts): pvmi = -1 for pm in pms[1:]: pstr = '\t%lx / %-8d' % (pm['vaddr'], pm['nr_pages']) - while vmas[vmi]['end'] <= pm['vaddr']: + while vmi < len(vmas) and vmas[vmi]['end'] <= pm['vaddr']: vmi += 1 pme = pm['vaddr'] + (pm['nr_pages'] << 12) vstr = '' - while vmas[vmi]['start'] < pme: + while vmi < len(vmas) and vmas[vmi]['start'] < pme: vma = vmas[vmi] if vmi == pvmi: vstr += ' ~' From 190216a532f9e1ec8686e953e64c2f8f5438db62 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 8 Jul 2024 16:50:23 +0000 Subject: [PATCH 293/321] test: better test for SELinux tools Previously the check was just if /sys/fs/selinux is mounted. This extends the check to see if all necessary tools are installed.
Signed-off-by: Adrian Reber --- test/zdtm/static/selinux00.checkskip | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/zdtm/static/selinux00.checkskip b/test/zdtm/static/selinux00.checkskip index 8d946a75e3..4c85647d10 100755 --- a/test/zdtm/static/selinux00.checkskip +++ b/test/zdtm/static/selinux00.checkskip @@ -2,6 +2,19 @@ test -d /sys/fs/selinux || exit 1 +# check if necessary commands are installed +if ! command -v setenforce &>/dev/null; then + exit 1 +fi + +if ! command -v setsebool &>/dev/null; then + exit 1 +fi + +if ! command -v getsebool &>/dev/null; then + exit 1 +fi + # See selinux00.hook for details getsebool unconfined_dyntrans_all > /dev/null 2>&1 From bc88db290c4f0d3f7e7728ca5d01dc8386ff0df8 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 8 Jul 2024 16:51:28 +0000 Subject: [PATCH 294/321] test: only run macvlan tests if macvlan devices can be created Some test environments (Actuated runners for example) do not support macvlan devices. Skip tests depending on it automatically. Signed-off-by: Adrian Reber --- test/others/ns_ext/run.sh | 5 ++++ test/zdtm/static/macvlan.checkskip | 38 ++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100755 test/zdtm/static/macvlan.checkskip diff --git a/test/others/ns_ext/run.sh b/test/others/ns_ext/run.sh index 4ebe3e2801..5d1e139d73 100755 --- a/test/others/ns_ext/run.sh +++ b/test/others/ns_ext/run.sh @@ -2,6 +2,11 @@ set -x +if ! ../../zdtm/static/macvlan.checkskip; then + echo "No macvlan support. Skipping" + exit 0 +fi + if [[ "$1" == "pid" ]]; then NS=pid else diff --git a/test/zdtm/static/macvlan.checkskip b/test/zdtm/static/macvlan.checkskip new file mode 100755 index 0000000000..f4e0609536 --- /dev/null +++ b/test/zdtm/static/macvlan.checkskip @@ -0,0 +1,38 @@ +#!/bin/bash + +FAIL=0 + +create_macvlan_device() { + if ! ip link add test_mvlan1 type veth >/dev/null 2>&1; then + FAIL=1 + fi + if !
ip link add mymacvlan1 link test_mvlan1 type macvlan >/dev/null 2>&1; then + FAIL=1 + fi + + return "${FAIL}" +} + +cleanup() { + ip link del test_mvlan1 >/dev/null 2>&1 + ip link del mymacvlan1 >/dev/null 2>&1 +} + +trap "cleanup" QUIT TERM INT HUP EXIT + +# Test once without loading the module +if create_macvlan_device; then + exit 0 +fi + +# Test once more with explicitly loading the module +if ! modprobe macvlan >/dev/null 2>&1; then + exit 1 +fi +create_macvlan_device + +if [ "${FAIL}" == "1" ]; then + exit 1 +fi + +exit 0 From e67b428437b6f5ea9a5ada768df82b098f878a81 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 8 Jul 2024 17:02:09 +0000 Subject: [PATCH 295/321] coredump: fail on unsupported architectures early Currently coredump only works on x86_64. Fail early on any other architecture. Signed-off-by: Adrian Reber --- coredump/coredump | 5 +++++ test/others/criu-coredump/test.sh | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/coredump/coredump b/coredump/coredump index f70d37c13b..3fbdafe81c 100755 --- a/coredump/coredump +++ b/coredump/coredump @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import platform import argparse import os import sys @@ -36,6 +37,10 @@ def main(): opts = vars(parser.parse_args()) + if platform.machine() != 'x86_64': + print('ERROR: %s only supported on x86_64' % sys.argv[0]) + sys.exit(1) + try: coredump(opts) except SystemExit as error: diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index eec2b817f4..4399044d71 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -43,5 +43,13 @@ function run_test { echo "= done" } +UNAME_M=$(uname -m) + +if [ "$UNAME_M" != "x86_64" ]; then + # the criu-coredump script is only x86_64 aware + echo "criu-coredump only support x86_64. skipping." 
+ exit 0 +fi + gen_imgs run_test From 5ba1f84f84aee77e009a509a9799a68fb206e1c1 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 4 Jul 2024 12:09:06 +0000 Subject: [PATCH 296/321] ci: run aarch64 tests native via actuated Signed-off-by: Adrian Reber --- .github/workflows/actuated-aarch64-test.yaml | 52 ++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 .github/workflows/actuated-aarch64-test.yaml diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml new file mode 100644 index 0000000000..8b0a63fc7b --- /dev/null +++ b/.github/workflows/actuated-aarch64-test.yaml @@ -0,0 +1,52 @@ +name: Actuated aarch64 test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: actuated-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + # Actuated runners are not available in all repositories. + if: ${{ github.repository == 'checkpoint-restore/criu' }} + # The memory size and the number of CPUs can be freely selected. + # 3GB and 4 CPUs seems to be enough according to the result from 'vmmeter'. 
+ runs-on: actuated-arm64-4cpu-3gb + strategy: + matrix: + target: [GCC=1, CLANG=1] + + steps: + # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md + # vmmeter start + - name: Prepare arkade + uses: alexellis/arkade-get@master + with: + crane: latest + print-summary: false + + - name: Install vmmeter + run: | + crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin + + - name: Run vmmeter + uses: self-actuated/vmmeter-action@master + # vmmeter end + + - uses: actions/checkout@v4 + - name: Run Tests ${{ matrix.target }} + # Following tests are failing on the actuated VMs: + # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out + # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) + # + # In combination with '--remote-lazy-pages' following error occurs: + # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) + run: | + # The 'sched_policy00' needs the following: + sudo sysctl -w kernel.sched_rt_runtime_us=-1 + # etc/hosts entry is needed for netns_lock_iptables + echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts + sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ + ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" From 5a74eee5b0a073b63a7d9a28c946ac32ddb1ae74 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 16 Aug 2024 22:15:20 +0100 Subject: [PATCH 297/321] cuda: unlock on timeout error When attempting to checkpoint a container with CUDA processes, CRIU could fail with the following error: Error (criu/cr-dump.c:1791): Timeout reached. 
Try to interrupt: 1 Error (cuda_plugin.c:143): cuda_plugin: Unable to read output of cuda-checkpoint: Interrupted system call Error (cuda_plugin.c:384): cuda_plugin: PAUSE_DEVICES failed with In this situation, the target process is locked, but CRIU fails due to a timeout and exits with an error. We need to make sure that the target PID is unlocked in such case. Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 39c78e370b..1745454760 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -4,6 +4,7 @@ #include "cr_options.h" #include "pid.h" #include "proc_parse.h" +#include "seize.h" #include #include @@ -379,18 +380,23 @@ int cuda_plugin_pause_devices(int pid) int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); if (status) { pr_err("PAUSE_DEVICES failed with %s\n", msg_buf); + if (alarm_timeouted()) + goto unlock; return -1; } + if (add_pid_to_buf(&cuda_pids, pid)) { pr_err("unable to track paused pid %d\n", pid); - status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid); - } - return -1; + goto unlock; } return 0; +unlock: + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid); + } + return -1; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) From 4ca4a0919e970f4409bbe2222e3187fb5ac3f6f1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 17 Aug 2024 14:30:21 +0100 Subject: [PATCH 298/321] scripts/uninstall_module: fix package discovery The `uninstall_module.py` script is a wrapper for the `pip uninstall` command that enables 
support for specifying installation prefix (i.e., `--prefix`). When this functionality is used, we intentionally set `sys.path` to include only search paths for the specified prefix to avoid unintentional uninstallation of packages in system paths. Since `importlib_metadata` version 8.1.0, the `Distribution.from_name()` method has been modified [1] to perform additional pre-processing of Distribution objects [2] that requires loading distribution metadata and results in the following error: File "/usr/local/lib/python3.12/site-packages/importlib_metadata/__init__.py", line 422, in buckets = bucket(dists, lambda dist: bool(dist.metadata)) ^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/site-packages/importlib_metadata/__init__.py", line 454, in metadata from . import _adapters File "/usr/local/lib/python3.12/site-packages/importlib_metadata/_adapters.py", line 3, in import email.message File "/usr/lib64/python3.12/email/message.py", line 11, in import quopri ModuleNotFoundError: No module named 'quopri' This error occurs because we have excluded system paths from the list of search paths (`sys.path`). However, this pre-processing is not required for our use case, as we only use the discovery mechanism of importlib_metadata to resolve the metadata directory path of the module being uninstalled. To fix this problem, this patch updates `uninstall_module` to avoid the `from_name()` method and use `discover(name=package_name)` directly. 
[1] https://github.com/python/importlib_metadata/commit/a65c29adc027b3615154cab73aaedd58a6aa23da [2] https://github.com/python/importlib_metadata/blob/a65c29ad/importlib_metadata/__init__.py#L391 Fixes: #2468 Signed-off-by: Radostin Stoyanov --- scripts/uninstall_module.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/uninstall_module.py b/scripts/uninstall_module.py index 439fca18a1..8a9b70892b 100755 --- a/scripts/uninstall_module.py +++ b/scripts/uninstall_module.py @@ -38,8 +38,9 @@ def uninstall_module(package_name: str, prefix=None): if prefix: add_site_dir(prefix) try: - dist_info_path = str(importlib_metadata.distribution(package_name)._path) - except importlib_metadata.PackageNotFoundError: + distribution = next(importlib_metadata.Distribution.discover(name=package_name)) + dist_info_path = str(distribution._path) + except StopIteration: print(f"Skipping {package_name} as it is not installed.") sys.exit(0) From e94c13c96ad9cba34f4f043d3fbd37530bd118fc Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 12 Sep 2024 14:30:30 +0100 Subject: [PATCH 299/321] codespell: fix typos This patch fixes the following typos reported by codespell: ./test/others/bers/bers.c:394: dependin ==> depending, depend in ./criu/kerndat.c:837: hitted ==> hit Signed-off-by: Radostin Stoyanov --- criu/kerndat.c | 2 +- test/others/bers/bers.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 1a584fe921..fa1ed21fad 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -834,7 +834,7 @@ static int kerndat_detect_stack_guard_gap(void) * (see kernel commit 1be7107fbe18ee). * * Same time there was semi-complete - * patch released which hitted a number + * patch released which hit a number * of repos (Ubuntu, Fedora) where instead * of PAGE_SIZE the 1M gap is cut off. 
*/ diff --git a/test/others/bers/bers.c b/test/others/bers/bers.c index 37cf84dd3d..b291e3bcbe 100644 --- a/test/others/bers/bers.c +++ b/test/others/bers/bers.c @@ -391,7 +391,7 @@ int main(int argc, char *argv[]) pr_msg(" -f|--files create files for each task\n"); pr_msg(" -m|--memory allocate megabytes for each task\n"); pr_msg(" --memory-chunks split memory to equal parts\n"); - pr_msg(" --mem-fill fill memory with data dependin on :\n"); + pr_msg(" --mem-fill fill memory with data depending on :\n"); pr_msg(" all fill every byte of memory\n"); pr_msg(" light fill first bytes of every page\n"); pr_msg(" dirtify fill every page\n"); From e4026fbe6b58a40e100d19b9dab4294df2b164e2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 11 Sep 2024 17:37:45 -0700 Subject: [PATCH 300/321] criu: Allow disabling freeze cgroups Some plugins (e.g., CUDA) may not function correctly when processes are frozen using cgroups. This change introduces a mechanism to disable the use of freeze cgroups during process seizing, even if explicitly requested via the --freeze-cgroup option. The CUDA plugin is updated to utilize this new mechanism to ensure compatibility. 
Signed-off-by: Andrei Vagin --- criu/include/seize.h | 1 + criu/seize.c | 66 +++++++++++++++++++++++++++++++------- plugins/cuda/cuda_plugin.c | 2 ++ 3 files changed, 58 insertions(+), 11 deletions(-) diff --git a/criu/include/seize.h b/criu/include/seize.h index 4545bf2627..3225029dd3 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -8,5 +8,6 @@ extern bool alarm_timeouted(void); extern char *task_comm_info(pid_t pid, char *comm, size_t size); extern char *__task_comm_info(pid_t pid); +extern void dont_use_freeze_cgroup(void); #endif diff --git a/criu/seize.c b/criu/seize.c index ba26072e6e..edeb57cc8a 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -25,6 +25,19 @@ #include "xmalloc.h" #include "util.h" +static bool freeze_cgroup_disabled; + +/* + * Disables the use of freeze cgroups for process seizing, even if explicitly + * requested via the --freeze-cgroup option. This is necessary for plugins + * (e.g., CUDA) that do not function correctly when processes are frozen using + * cgroups. 
+ */ +void __attribute__((used)) dont_use_freeze_cgroup(void) +{ + freeze_cgroup_disabled = true; +} + char *task_comm_info(pid_t pid, char *comm, size_t size) { bool is_read = false; @@ -397,7 +410,7 @@ static int freezer_detach(void) { int i; - if (!opts.freeze_cgroup) + if (!opts.freeze_cgroup || freeze_cgroup_disabled) return 0; for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { @@ -492,6 +505,31 @@ static int log_unfrozen_stacks(char *root) return 0; } +static int check_freezer_cgroup(void) +{ + enum freezer_state state = THAWED; + int fd; + + BUG_ON(!freeze_cgroup_disabled); + + fd = freezer_open(); + if (fd < 0) + return -1; + + state = get_freezer_state(fd); + close(fd); + if (state == FREEZER_ERROR) { + return -1; + } + + if (state != THAWED) { + pr_err("One or more plugins are incompatible with the freezer cgroup in the FROZEN state.\n"); + return -1; + } + + return 0; +} + static int freeze_processes(void) { int fd, exit_code = -1; @@ -643,7 +681,7 @@ static int collect_children(struct pstree_item *item) goto free; } - if (!opts.freeze_cgroup) + if (!opts.freeze_cgroup || freeze_cgroup_disabled) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -831,7 +869,8 @@ static int collect_threads(struct pstree_item *item) pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid); - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) + if ((!opts.freeze_cgroup || freeze_cgroup_disabled) && + compel_interrupt_task(pid)) continue; ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL); @@ -887,7 +926,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i { int attempts = NR_ATTEMPTS, nr_inprogress = 1; - if (opts.freeze_cgroup) + if (opts.freeze_cgroup && !freeze_cgroup_disabled) attempts = 1; /* @@ -993,12 +1032,16 @@ int collect_pstree(void) pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 
2 : 1); - if (opts.freeze_cgroup && freeze_processes()) - goto err; - - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { - set_cr_errno(ESRCH); - goto err; + if (opts.freeze_cgroup && !freeze_cgroup_disabled) { + if (freeze_processes()) + goto err; + } else { + if (opts.freeze_cgroup && check_freezer_cgroup()) + goto err; + if (compel_interrupt_task(pid)) { + set_cr_errno(ESRCH); + goto err; + } } ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL); @@ -1024,7 +1067,8 @@ int collect_pstree(void) if (ret < 0) goto err; - if (opts.freeze_cgroup && freezer_wait_processes()) { + if (opts.freeze_cgroup && !freeze_cgroup_disabled && + freezer_wait_processes()) { ret = -1; goto err; } diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 1745454760..04d70b114f 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -483,6 +483,8 @@ int cuda_plugin_init(int stage) INIT_LIST_HEAD(&cuda_pids); } + dont_use_freeze_cgroup(); + return 0; } From 1190f10e5585854c163842b9a07f407ab370c9d9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 12 Sep 2024 11:17:56 -0700 Subject: [PATCH 301/321] fault: allow to check dont_use_freeze_cgroup Adds a new "fault" to call dont_use_freeze_cgroup. 
--- criu/fault-injection.c | 9 +++++++++ criu/include/fault-injection.h | 1 + criu/include/seize.h | 1 + test/jenkins/criu-fault.sh | 4 ++++ 4 files changed, 15 insertions(+) diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 83dc1fc8d6..2272e6d842 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -1,6 +1,7 @@ #include #include "criu-log.h" #include "fault-injection.h" +#include "seize.h" enum faults fi_strategy; @@ -21,5 +22,13 @@ int fault_injection_init(void) } fi_strategy = start; + + switch (fi_strategy) { + case FI_DISABLE_FREEZE_CGROUP: + dont_use_freeze_cgroup(); + break; + default: + break; + }; return 0; } diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 552ee43389..82c3a1f7fc 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -21,6 +21,7 @@ enum faults { FI_CORRUPT_EXTREGS = 134, FI_DONT_USE_PAGEMAP_SCAN = 135, FI_DUMP_CRASH = 136, + FI_DISABLE_FREEZE_CGROUP = 137, FI_MAX, }; diff --git a/criu/include/seize.h b/criu/include/seize.h index 3225029dd3..f5ea76b16c 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,6 +2,7 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); +struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); extern bool alarm_timeouted(void); diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 1fda40a969..fc0eddc2b2 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -40,6 +40,10 @@ fi # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail +# check dont_use_freeze_cgroup +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst + if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then fail fi From 
655184757e4fcccc1a6cae1e9f2409a8154a0406 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 14 Sep 2024 09:08:12 -0700 Subject: [PATCH 302/321] plugin/cuda: disable CUDA plugin if /dev/nvidiactl isn't present The presence of /dev/nvidiactl indicates that the system has a compatible NVIDIA GPU driver installed and that the GPU is accessible to the operating system. Signed-off-by: Andrei Vagin --- criu/include/fault-injection.h | 1 + plugins/cuda/cuda_plugin.c | 10 +++++++++- scripts/ci/run-ci-tests.sh | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 82c3a1f7fc..59adf05b9e 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -22,6 +22,7 @@ enum faults { FI_DONT_USE_PAGEMAP_SCAN = 135, FI_DUMP_CRASH = 136, FI_DISABLE_FREEZE_CGROUP = 137, + FI_PLUGIN_CUDA_FORCE_ENABLE = 138, FI_MAX, }; diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 04d70b114f..23c3f4b1ab 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -5,6 +5,7 @@ #include "pid.h" #include "proc_parse.h" #include "seize.h" +#include "fault-injection.h" #include #include @@ -460,8 +461,15 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_ int cuda_plugin_init(int stage) { - int ret = cuda_checkpoint_supports_flag("--action"); + int ret; + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { + pr_info("/dev/nvidiactl doesn't exist. 
The CUDA plugin is disabled.\n"); + plugin_disabled = true; + return 0; + } + + ret = cuda_checkpoint_supports_flag("--action"); if (ret == -1) { pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT); plugin_disabled = true; diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 26ea00c537..38b7b5097f 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -363,4 +363,4 @@ make -C plugins/amdgpu/ test_topology_remap ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda -./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint +./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138 From 412cdd20031861062740e82f61666012db60c342 Mon Sep 17 00:00:00 2001 From: David Francis Date: Mon, 16 Sep 2024 09:36:25 -0400 Subject: [PATCH 303/321] plugins/amdgpu: Zero ib_info on initialization This struct was being used un-initialized, meaning it was filled with random garbage. Mea culpa. 
Signed-off-by: David Francis --- plugins/amdgpu/amdgpu_plugin.c | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 707aea5a98..b56ba6d140 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -608,6 +608,7 @@ static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, while (bytes_remain > 0) { memset(&cs_req, 0, sizeof(cs_req)); memset(&fence, 0, sizeof(fence)); + memset(&ib_info, 0, sizeof(ib_info)); memset(ib, 0, packets_per_buffer * 28); if (type == SDMA_OP_VRAM_WRITE) { From e4518382fabfffb6be86bf5cd449b58faea42a98 Mon Sep 17 00:00:00 2001 From: David Francis Date: Mon, 16 Sep 2024 09:43:12 -0400 Subject: [PATCH 304/321] plugins/amdgpu - Increase maximum parameter length The topology parsing assumed that all parameter names were 30 characters or fewer, but recommended_sdma_engine_id_mask is 31 characters. Make the maximum length a macro, and set it to 64. Signed-off-by: David Francis --- plugins/amdgpu/amdgpu_plugin_topology.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index c5fa51fdab..5b4396a0cc 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -20,6 +20,7 @@ #include "amdgpu_plugin_topology.h" #define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/" +#define MAX_PARAMETER_LEN 64 /* User override options */ /* Skip firmware version check */ @@ -417,7 +418,9 @@ struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id static bool get_prop(char *line, char *name, uint64_t *value) { - if (sscanf(line, " %29s %lu", name, value) != 2) + char format[16]; + sprintf(format, " %%%ds %%lu", MAX_PARAMETER_LEN); + if (sscanf(line, format, name, value) != 2) return false; return true; } @@ -437,7 +440,7 @@ static int parse_topo_node_properties(struct tp_node *dev, 
const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -565,7 +568,7 @@ static int parse_topo_node_mem_banks(struct tp_node *node, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -654,7 +657,7 @@ static int parse_topo_node_iolinks(struct tp_node *node, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); From 1079a51f834cb3b52e0152b9d309cd67fa52cb01 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 12 Sep 2024 11:19:35 -0700 Subject: [PATCH 305/321] util: dump fsfd log messages It should help to investigate errors of fsconfig, fsmount and etc. Signed-off-by: Andrei Vagin --- criu/cgroup.c | 18 +++++++----- criu/cr-check.c | 21 +++++++------- criu/include/syscall.h | 17 ------------ criu/include/util.h | 5 ++++ criu/util.c | 62 +++++++++++++++++++++++++++++++++++++++--- 5 files changed, 84 insertions(+), 39 deletions(-) delete mode 100644 criu/include/syscall.h diff --git a/criu/cgroup.c b/criu/cgroup.c index d90b70bb79..fcaed07080 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -28,7 +28,6 @@ #include "images/cgroup.pb-c.h" #include "kerndat.h" #include "linux/mount.h" -#include "syscall.h" /* * This structure describes set of controller groups @@ -581,14 +580,15 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) int fsfd, fd; char *name; - fsfd = sys_fsopen(fstype, 0); + fsfd = cr_fsopen(fstype, 0); if (fsfd < 0) { pr_perror("Unable to open the cgroup file system"); return -1; } if (strstartswith(cc->name, namestr)) { - if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { + 
fsfd_dump_messages(fsfd); pr_perror("Unable to configure the cgroup (%s) file system", cc->name); goto err; } @@ -596,7 +596,8 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) char *saveptr = NULL, *buf = strdupa(cc->name); name = strtok_r(buf, ",", &saveptr); while (name) { - if (sys_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to configure the cgroup (%s) file system", name); goto err; } @@ -604,14 +605,17 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) } } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to create the cgroup (%s) file system", cc->name); goto err; } - fd = sys_fsmount(fsfd, 0, 0); - if (fd < 0) + fd = cr_fsmount(fsfd, 0, 0); + if (fd < 0) { + fsfd_dump_messages(fsfd); pr_perror("Unable to mount the cgroup (%s) file system", cc->name); + } close(fsfd); return fd; diff --git a/criu/cr-check.c b/criu/cr-check.c index 507f9915ca..0388cbe7fe 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -54,7 +54,6 @@ #include "restorer.h" #include "uffd.h" #include "linux/aio_abi.h" -#include "syscall.h" #include "mount-v2.h" #include "images/inventory.pb-c.h" @@ -1437,18 +1436,18 @@ static int ovl_mount(void) { int tmpfs, fsfd, ovl; - fsfd = sys_fsopen("tmpfs", 0); + fsfd = cr_fsopen("tmpfs", 0); if (fsfd == -1) { pr_perror("Unable to fsopen tmpfs"); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { pr_perror("Unable to create tmpfs mount"); return -1; } - tmpfs = sys_fsmount(fsfd, 0, 0); + tmpfs = cr_fsmount(fsfd, 0, 0); if (tmpfs == -1) { pr_perror("Unable to mount tmpfs"); return -1; @@ -1475,23 +1474,23 @@ static int ovl_mount(void) return -1; } - fsfd = sys_fsopen("overlay", 0); + fsfd = cr_fsopen("overlay", 
0); if (fsfd == -1) { pr_perror("Unable to fsopen overlayfs"); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || - sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || - sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || - sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { pr_perror("Unable to configure overlayfs"); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { pr_perror("Unable to create overlayfs"); return -1; } - ovl = sys_fsmount(fsfd, 0, 0); + ovl = cr_fsmount(fsfd, 0, 0); if (ovl == -1) { pr_perror("Unable to mount overlayfs"); return -1; diff --git a/criu/include/syscall.h b/criu/include/syscall.h deleted file mode 100644 index c38d6d971b..0000000000 --- a/criu/include/syscall.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __CR_SYSCALL_H__ -#define __CR_SYSCALL_H__ - -static inline int sys_fsopen(const char *fsname, unsigned int flags) -{ - return syscall(__NR_fsopen, fsname, flags); -} -static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) -{ - return syscall(__NR_fsconfig, fd, cmd, key, value, aux); -} -static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) -{ - return syscall(__NR_fsmount, fd, flags, attr_flags); -} - -#endif /* __CR_SYSCALL_H__ */ \ No newline at end of file diff --git a/criu/include/util.h b/criu/include/util.h index 435469e1ec..ae293a68c8 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -387,6 +387,11 @@ static inline void print_stack_trace(pid_t pid) 
extern int mount_detached_fs(const char *fsname); +extern int cr_fsopen(const char *fsname, unsigned int flags); +extern int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux); +extern int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags); +extern void fsfd_dump_messages(int fd); + extern char *get_legacy_iptables_bin(bool ipv6, bool restore); extern int set_opts_cap_eff(void); diff --git a/criu/util.c b/criu/util.c index 7dfa1fe424..d2bc9a8657 100644 --- a/criu/util.c +++ b/criu/util.c @@ -39,7 +39,6 @@ #include "mem.h" #include "namespaces.h" #include "criu-log.h" -#include "syscall.h" #include "util-caps.h" #include "clone-noasan.h" @@ -1556,23 +1555,78 @@ void print_stack_trace(pid_t pid) } #endif +int cr_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + int ret = syscall(__NR_fsconfig, fd, cmd, key, value, aux); + if (ret) + fsfd_dump_messages(fd); + return ret; +} + +int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + int ret = syscall(__NR_fsmount, fd, flags, attr_flags); + if (ret) + fsfd_dump_messages(fd); + return ret; +} + +void fsfd_dump_messages(int fd) +{ + char buf[4096]; + int err, n; + + err = errno; + + for (;;) { + n = read(fd, buf, sizeof(buf) - 1); + if (n < 0) { + if (errno != ENODATA) + pr_perror("Unable to read from fs descriptor"); + break; + } + buf[n] = 0; + + switch (buf[0]) { + case 'w': + pr_warn("%s\n", buf); + break; + case 'i': + pr_info("%s\n", buf); + break; + case 'e': + /* fallthrough */ + default: + pr_err("%s\n", buf); + break; + } + } + + errno = err; +} + int mount_detached_fs(const char *fsname) { int fsfd, fd; - fsfd = sys_fsopen(fsname, 0); + fsfd = cr_fsopen(fsname, 0); if (fsfd < 0) { pr_perror("Unable to open the %s file system", fsname); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) 
< 0) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { pr_perror("Unable to create the %s file system", fsname); close(fsfd); return -1; } - fd = sys_fsmount(fsfd, 0, 0); + fd = cr_fsmount(fsfd, 0, 0); if (fd < 0) pr_perror("Unable to mount the %s file system", fsname); close(fsfd); From 34e7134ff5bcb7bb2f50f1b70484266535a4eca6 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 26 Sep 2024 12:24:52 +0100 Subject: [PATCH 306/321] amdgpu: remove exec permissions on source files This patch fixes the following warnings that appear when building an RPM package: + /usr/lib/rpm/redhat/brp-mangle-shebangs *** WARNING: ./usr/src/debug/criu-4.0-1.fc42.x86_64/plugins/amdgpu/amdgpu_plugin_util.c is executable but has no shebang, removing executable bit *** WARNING: ./usr/src/debug/criu-4.0-1.fc42.x86_64/plugins/amdgpu/amdgpu_plugin_util.h is executable but has no shebang, removing executable bit Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_util.c | 0 plugins/amdgpu/amdgpu_plugin_util.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 plugins/amdgpu/amdgpu_plugin_util.c mode change 100755 => 100644 plugins/amdgpu/amdgpu_plugin_util.h diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c old mode 100755 new mode 100644 diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h old mode 100755 new mode 100644 From 55c8917ca7d7287a29afb40036145b08aa61da56 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 26 Sep 2024 10:59:32 +0100 Subject: [PATCH 307/321] Makefile.config: set CR_PLUGIN_DEFAULT variable By default, CRIU uses the path "/usr/lib/criu" to install and load plugins at runtime. This path is defined by the `PLUGINDIR` variable in Makefile.install and `CR_PLUGIN_DEFAULT` in `criu/include/plugin.h`. However, some distribution packages might install the CRIU plugins at "/usr/lib64/criu" instead. 
This patch updates the makefile to align the path defined by `CR_PLUGIN_DEFAULT` with the value of `PLUGINDIR`. Signed-off-by: Radostin Stoyanov --- Makefile.config | 4 ++++ plugins/amdgpu/Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile.config b/Makefile.config index 52c250b21c..5ab689d411 100644 --- a/Makefile.config +++ b/Makefile.config @@ -59,6 +59,10 @@ endif export LIBS += $(LIBS_FEATURES) +ifneq ($(PLUGINDIR),) + FEATURE_DEFINES += -DCR_PLUGIN_DEFAULT="\"$(PLUGINDIR)\"" +endif + CONFIG_FILE = .config $(CONFIG_FILE): diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 7d3388b80e..a20d1d1639 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -15,7 +15,7 @@ DEPS_NOK := ; __nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk -PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" +PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu ifeq ($(CONFIG_AMDGPU),y) From 71b427ae27808a4db770d79c7df93a32278d3486 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 13 Jun 2024 20:00:09 +0530 Subject: [PATCH 308/321] images: Add protobuf definition for pidfd We only use the last pid from the list in NSpid entry (from /proc//fdinfo/) while restoring pidfds. The last pid refers to the pid of the process in the most deeply nested pid namespace. Since CRIU does not currently support nested pid namespaces, this entry is the one we want. After Linux 6.9, inode numbers can be used to compare pidfds. pidfds referring to the same process will have the same inode numbers. We use inode numbers to restore pidfds that point to dead processes. 
Signed-off-by: Bhavik Sachdev --- images/Makefile | 1 + images/fdinfo.proto | 3 +++ images/pidfd.proto | 13 +++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 images/pidfd.proto diff --git a/images/Makefile b/images/Makefile index ca85b1a213..855d894da6 100644 --- a/images/Makefile +++ b/images/Makefile @@ -73,6 +73,7 @@ proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o proto-obj-y += rseq.o +proto-obj-y += pidfd.o CFLAGS += -iquote $(obj)/ diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 88f1c11860..32ec13cf48 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -17,6 +17,7 @@ import "ext-file.proto"; import "sk-unix.proto"; import "fifo.proto"; import "pipe.proto"; +import "pidfd.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; @@ -42,6 +43,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + PIDFD = 20; /* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -78,4 +80,5 @@ message file_entry { optional tty_file_entry tty = 19; optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; + optional pidfd_entry pidfd = 22; } diff --git a/images/pidfd.proto b/images/pidfd.proto new file mode 100644 index 0000000000..a9da3e4543 --- /dev/null +++ b/images/pidfd.proto @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "fown.proto"; + +message pidfd_entry { + required uint32 id = 1; + required uint32 ino = 2; + required uint32 flags = 3; + required int32 nspid = 4; + required fown_entry fown = 5; +} From d559ebb414f84a3b4d6ef3928bf116ddda3f3b67 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 13 Jun 2024 21:18:51 +0530 Subject: [PATCH 309/321] criu: Support C/R of pidfds Process file descriptors (pidfds) were introduced to provide a stable handle on a process. They solve the problem of pid recycling. 
For a detailed explanation, see https://lwn.net/Articles/801319/ and http://www.corsix.org/content/what-is-a-pidfd Before Linux 6.9, anonymous inodes were used for the implementation of pidfds. So, we detect them in a fashion similar to other fd types that use anonymous inodes by calling `readlink()`. In Linux 6.9, pidfs (a file system for pidfds) was introduced. In 6.9 `S_ISREG()` returned true for pidfds, but this again changed with 6.10. (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/pidfs.c?h=v6.11-rc2#n285) After this change, pidfs inodes have no file type in st_mode in userspace. We use `PID_FS_MAGIC` to detect pidfds for kernels >= 6.9. Hence, the check for pidfds occurs before the check for regular files. For pidfds that refer to dead processes, we lose the pid of the process as the Pid and NSpid fields in /proc//fdinfo/ change to -1. So, we create a temporary process for each unique inode and open pidfds that refer to this process. After all pidfds have been opened, we kill this temporary process. This commit does not include support for pidfds that point to a specific thread, i.e. pidfds opened with the `PIDFD_THREAD` flag. 
Fixes: #2258 Signed-off-by: Bhavik Sachdev --- criu/Makefile.crtools | 1 + criu/cr-restore.c | 3 +- criu/files.c | 17 +++ criu/image-desc.c | 1 + criu/include/fs-magic.h | 4 + criu/include/image-desc.h | 1 + criu/include/magic.h | 1 + criu/include/pidfd.h | 16 ++ criu/include/protobuf-desc.h | 1 + criu/pidfd.c | 287 +++++++++++++++++++++++++++++++++++ criu/proc_parse.c | 29 ++++ criu/protobuf-desc.c | 1 + 12 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 criu/include/pidfd.h create mode 100644 criu/pidfd.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 3ddf45cd70..ba6132d2f7 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -101,6 +101,7 @@ obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o obj-y += hugetlb.o +obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 4d4dfbe6fe..d5b6c8037a 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -79,6 +79,7 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" +#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" @@ -280,7 +281,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo }; /* These images are required to restore namespaces */ diff --git a/criu/files.c b/criu/files.c index 3b653e24be..a57fb860fb 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -544,6 +545,8 @@ static int dump_one_file(struct pid 
*pid, int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_pidfd_link(link)) + ops = &pidfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -554,6 +557,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } + if (p.fs_type == PID_FS_MAGIC) { + ops = &pidfd_dump_ops; + return do_dump_gen_file(&p, lfd, ops, e); + } + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; @@ -1778,6 +1786,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; + case FD_TYPES__PIDFD: + ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); + break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); @@ -1800,5 +1811,11 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); + + if (init_dead_pidfd_hash()) { + pr_err("Could not initialise hash map for dead pidfds\n"); + return -1; + } + return collect_image(&files_cinfo); } diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c0986..2d87c73815 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ad34f48915..ffc0455d5f 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -57,4 +57,8 @@ #define OVERLAYFS_SUPER_MAGIC 0x794c7630 #endif +#ifndef PID_FS_MAGIC 
+#define PID_FS_MAGIC 0x50494446 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 9f369be645..79e1ac1113 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -113,6 +113,7 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, + CR_FD_PIDFD, CR_FD_AUTOFS, diff --git a/criu/include/magic.h b/criu/include/magic.h index 0e8c37234e..6f0aff26d8 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -100,6 +100,7 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define PIDFD_MAGIC 0x54435556 /* Ufa */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h new file mode 100644 index 0000000000..4d2d71700e --- /dev/null +++ b/criu/include/pidfd.h @@ -0,0 +1,16 @@ +#ifndef __CR_PIDFD_H__ +#define __CR_PIDFD_H__ + +#include "files.h" +#include "pidfd.pb-c.h" + +extern const struct fdtype_ops pidfd_dump_ops; +extern struct collect_image_info pidfd_cinfo; +extern int is_pidfd_link(char *link); +extern int init_dead_pidfd_hash(void); +struct pidfd_dump_info { + PidfdEntry pidfe; + pid_t pid; +}; + +#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101f..c4241be557 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,7 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_PIDFD, /* PB_AUTOGEN_STOP */ diff --git a/criu/pidfd.c b/criu/pidfd.c new file mode 100644 index 0000000000..fdf5dec60e --- /dev/null +++ b/criu/pidfd.c @@ -0,0 +1,287 @@ +#include "common/lock.h" +#include "imgset.h" +#include "pidfd.h" +#include "fdinfo.h" +#include "pidfd.pb-c.h" +#include "protobuf.h" +#include "pstree.h" +#include +#include +#include +#include "common/bug.h" +#include "rst-malloc.h" + +#undef LOG_PREFIX 
+#define LOG_PREFIX "pidfd: "
+
+#ifndef PIDFD_THREAD
+#define PIDFD_THREAD O_EXCL
+#endif
+
+struct pidfd_info {
+	PidfdEntry *pidfe;
+	struct file_desc d;
+};
+
+struct dead_pidfd {
+	unsigned int ino;
+	int pid;
+	size_t count;
+	mutex_t pidfd_lock;
+	struct hlist_node hash;
+};
+
+#define DEAD_PIDFD_HASH_SIZE 32
+static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE];
+static mutex_t *dead_pidfd_hash_lock;
+
+int init_dead_pidfd_hash(void)
+{
+	for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&dead_pidfd_hash[i]);
+
+	dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock));
+	if (!dead_pidfd_hash_lock)
+		return -1;
+
+	mutex_init(dead_pidfd_hash_lock);
+
+	return 0;
+}
+
+static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino)
+{
+	struct dead_pidfd *dead;
+	struct hlist_head *chain;
+
+	mutex_lock(dead_pidfd_hash_lock);
+	chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE];
+	hlist_for_each_entry(dead, chain, hash) {
+		if (dead->ino == ino) {
+			mutex_unlock(dead_pidfd_hash_lock);
+			return dead;
+		}
+	}
+	mutex_unlock(dead_pidfd_hash_lock);
+
+	return NULL;
+}
+
+int is_pidfd_link(char *link)
+{
+	/*
+	 * pidfs was introduced in Linux 6.9
+	 * before which anonymous-inodes were used
+	 */
+	return is_anon_link_type(link, "[pidfd]");
+}
+
+static void pr_info_pidfd(char *action, PidfdEntry *pidfe)
+{
+	pr_info("%s: id %#08x flags %u NSpid %d ino %u\n",
+		action, pidfe->id, pidfe->flags, pidfe->nspid, pidfe->ino
+	);
+}
+
+static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p)
+{
+	struct pidfd_dump_info pidfd_info = {.pidfe = PIDFD_ENTRY__INIT};
+	FileEntry fe = FILE_ENTRY__INIT;
+
+	if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info))
+		return -1;
+
+	if (p->flags & PIDFD_THREAD) {
+		pr_err("PIDFD_THREAD flag is currently not supported\n");
+		return -1;
+	}
+
+	/*
+	 * Check if the pid pidfd refers to is part of process tree
+	 * This ensures the process will exist on restore.
+	 */
+	if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) {
+		pr_err("pidfd pid %d is not a part of process tree..\n",
+			pidfd_info.pid);
+		return -1;
+	}
+
+	pidfd_info.pidfe.id = id;
+	pidfd_info.pidfe.flags = (p->flags & ~O_RDWR);
+	pidfd_info.pidfe.fown = (FownEntry *)&p->fown;
+
+	fe.type = FD_TYPES__PIDFD;
+	fe.id = pidfd_info.pidfe.id;
+	fe.pidfd = &pidfd_info.pidfe;
+
+	pr_info_pidfd("Dumping", &pidfd_info.pidfe);
+	return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE);
+}
+
+const struct fdtype_ops pidfd_dump_ops = {
+	.type = FD_TYPES__PIDFD,
+	.dump = dump_one_pidfd,
+};
+
+static int pidfd_open(pid_t pid, int flags)
+{
+	return syscall(__NR_pidfd_open, pid, flags);
+}
+
+static int create_tmp_process(void)
+{
+	int tmp_process;
+	tmp_process = fork();
+	if (tmp_process < 0) {
+		pr_perror("Could not fork");
+		return -1;
+	} else if (tmp_process == 0) {
+		while(1)
+			sleep(1);
+	}
+	return tmp_process;
+}
+
+static int free_dead_pidfd(struct dead_pidfd *dead)
+{
+	int status;
+
+	if (kill(dead->pid, SIGKILL) < 0) {
+		pr_perror("Could not kill temporary process with pid: %d",
+			dead->pid);
+		goto err;
+	}
+
+	if (waitpid(dead->pid, &status, 0) != dead->pid) {
+		pr_perror("Could not wait on temporary process with pid: %d",
+			dead->pid);
+		goto err;
+	}
+
+	if (!WIFSIGNALED(status)) {
+		pr_err("Expected temporary process to be terminated by a signal\n");
+		goto err;
+	}
+
+	if (WTERMSIG(status) != SIGKILL) {
+		pr_err("Expected temporary process to be terminated by SIGKILL\n");
+		goto err;
+	}
+
+	mutex_lock(dead_pidfd_hash_lock);
+	hlist_del(&dead->hash);
+	mutex_unlock(dead_pidfd_hash_lock);
+	return 0;
+err:
+	return -1;
+}
+
+static int open_one_pidfd(struct file_desc *d, int *new_fd)
+{
+	struct pidfd_info *info;
+	struct dead_pidfd *dead = NULL;
+	int pidfd;
+
+	info = container_of(d, struct pidfd_info, d);
+	if (info->pidfe->nspid != -1) {
+		pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags);
+		if (pidfd < 0) {
+			pr_perror("Could not open pidfd for %d", info->pidfe->nspid);
+			goto err_close;
+		}
+		goto out;
+	}
+
+	dead = lookup_dead_pidfd(info->pidfe->ino);
+	BUG_ON(!dead);
+
+	mutex_lock(&dead->pidfd_lock);
+	BUG_ON(dead->count == 0);
+	dead->count--;
+	if (dead->pid == -1) {
+		dead->pid = create_tmp_process();
+		if (dead->pid < 0) {
+			mutex_unlock(&dead->pidfd_lock);
+			goto err_close;
+		}
+	}
+
+	pidfd = pidfd_open(dead->pid, info->pidfe->flags);
+	if (pidfd < 0) {
+		pr_perror("Could not open pidfd for %d", dead->pid);
+		mutex_unlock(&dead->pidfd_lock);
+		goto err_close;
+	}
+
+	if (dead->count == 0) {
+		if (free_dead_pidfd(dead)) {
+			pr_err("Failed to delete dead_pidfd struct\n");
+			mutex_unlock(&dead->pidfd_lock);
+			close(pidfd);
+			goto err_close;
+		}
+	}
+	mutex_unlock(&dead->pidfd_lock);
+
+out:
+	if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) {
+		close(pidfd); /* do not leak the fd on failure */
+		goto err_close;
+	}
+	*new_fd = pidfd;
+	return 0;
+err_close:
+	pr_err("Can't create pidfd %#08x NSpid: %d flags: %u\n",
+		info->pidfe->id, info->pidfe->nspid, info->pidfe->flags);
+	return -1;
+}
+
+static struct file_desc_ops pidfd_desc_ops = {
+	.type = FD_TYPES__PIDFD,
+	.open = open_one_pidfd
+};
+
+static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i)
+{
+	struct dead_pidfd *dead;
+	struct pidfd_info *info = obj;
+
+	info->pidfe = pb_msg(msg, PidfdEntry);
+	pr_info_pidfd("Collected ", info->pidfe);
+
+	if (info->pidfe->nspid != -1)
+		goto out;
+
+	dead = lookup_dead_pidfd(info->pidfe->ino);
+	if (dead) {
+		mutex_lock(&dead->pidfd_lock);
+		dead->count++;
+		mutex_unlock(&dead->pidfd_lock);
+		goto out;
+	}
+
+	dead = shmalloc(sizeof(*dead));
+	if (!dead) {
+		pr_err("Could not allocate shared memory..\n");
+		return -1;
+	}
+
+	INIT_HLIST_NODE(&dead->hash);
+	dead->ino = info->pidfe->ino;
+	dead->count = 1;
+	dead->pid = -1;
+	mutex_init(&dead->pidfd_lock);
+
+	mutex_lock(dead_pidfd_hash_lock);
+	hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]);
+	mutex_unlock(dead_pidfd_hash_lock);
+out:
+	return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops);
+}
+
+struct collect_image_info pidfd_cinfo = {
+	.fd_type = CR_FD_PIDFD,
+	.pb_type = PB_PIDFD,
+	.priv_size = sizeof(struct pidfd_info),
+	.collect = collect_one_pidfd,
+};
diff --git a/criu/proc_parse.c b/criu/proc_parse.c
index 55aefac7d7..95ebe3a411 100644
--- a/criu/proc_parse.c
+++ b/criu/proc_parse.c
@@ -42,10 +42,12 @@
 #include "fault-injection.h"
 #include "memfd.h"
 #include "hugetlb.h"
+#include "pidfd.h"
 
 #include "protobuf.h"
 #include "images/fdinfo.pb-c.h"
 #include "images/mnt.pb-c.h"
+#include "pidfd.pb-c.h"
 #include "plugin.h"
 
 #include
@@ -2165,6 +2167,33 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg)
 			if (ret)
 				goto parse_err;
 
+			entry_met = true;
+			continue;
+		}
+		if (fdinfo_field(str, "ino") || fdinfo_field(str, "NSpid") || fdinfo_field(str, "Pid")) {
+			struct pidfd_dump_info *pidfd_info = arg;
+
+			if (type != FD_TYPES__PIDFD)
+				continue;
+
+			if (fdinfo_field(str, "ino")) {
+				ret = sscanf(str, "%*s %u", &pidfd_info->pidfe.ino);
+				if (ret != 1)
+					goto parse_err;
+			} else if (fdinfo_field(str, "Pid")) {
+				ret = sscanf(str, "%*s %d", &pidfd_info->pid);
+				if (ret != 1)
+					goto parse_err;
+			} else if (fdinfo_field(str, "NSpid")) {
+				char *last;
+
+				last = strrchr(str, '\t');
+				if (!last || sscanf(last, "%d", &pidfd_info->pidfe.nspid) != 1) {
+					pr_err("Unable to parse: %s\n", str);
+					goto parse_err;
+				}
+			}
+
 			entry_met = true;
 			continue;
 		}
diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c
index ff16b9f5be..e0dbfccc21 100644
--- a/criu/protobuf-desc.c
+++ b/criu/protobuf-desc.c
@@ -68,6 +68,7 @@
 #include "images/bpfmap-file.pb-c.h"
 #include "images/bpfmap-data.pb-c.h"
 #include "images/apparmor.pb-c.h"
+#include "images/pidfd.pb-c.h"
 
 struct cr_pb_message_desc cr_pb_descs[PB_MAX];
 
From 005a33198640a411295eb965e4e56b8bcc0b95bb Mon Sep 17 00:00:00 2001
From: Bhavik Sachdev
Date: Tue, 9 Jul 2024
19:58:29 +0530
Subject: [PATCH 310/321] zdtm: Check pidfd fdinfo entry is consistent

Ensures that entries in /proc//fdinfo/ are same.

Signed-off-by: Bhavik Sachdev
---
 test/zdtm/static/Makefile | 1 +
 test/zdtm/static/pidfd_self.c | 140 ++++++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 test/zdtm/static/pidfd_self.c

diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile
index 1e891f0ba4..a2e852d73c 100644
--- a/test/zdtm/static/Makefile
+++ b/test/zdtm/static/Makefile
@@ -53,6 +53,7 @@ TST_NOFILE := \
 		shm \
 		shm-mp \
 		ptrace_sig \
+		pidfd_self \
 		pipe00 \
 		pipe01 \
 		pipe02 \
diff --git a/test/zdtm/static/pidfd_self.c b/test/zdtm/static/pidfd_self.c
new file mode 100644
index 0000000000..2730ee123d
--- /dev/null
+++ b/test/zdtm/static/pidfd_self.c
@@ -0,0 +1,140 @@
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Check pidfd /proc/self/fdinfo/ entry remains consistent after checkpoint/restore\n";
+const char *test_author = "Bhavik Sachdev ";
+
+struct pidfd_status {
+	unsigned int flags;
+	pid_t pid;
+};
+
+static int pidfd_open(pid_t pid, unsigned int flags)
+{
+	return syscall(__NR_pidfd_open, pid, flags);
+}
+
+static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags)
+{
+	return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
+}
+
+static void show_pidfd(char *prefix, struct pidfd_status *s)
+{
+	test_msg("\n\t%s\n\tflags: 0%o\n\tpid: %d\n", prefix, s->flags, s->pid);
+}
+
+static int parse_self_fdinfo(int pidfd, struct pidfd_status *s)
+{
+	char buf[256];
+	int ret = -1;
+	FILE *f;
+
+	sprintf(buf, "/proc/self/fdinfo/%d", pidfd);
+	f = fopen(buf, "r");
+	if (!f) {
+		perror("Can't open /proc/self/fdinfo/ to parse");
+		return -1;
+	}
+
+	memset(s, 0, sizeof(*s));
+
+	/*
+	 * flags: file access mode (octal) 02000002 => [O_RDWR | O_CLOEXEC]
+	 * pid: the pid to which we have pidfd open
+	 */
+	while (fgets(buf, sizeof(buf), f)) {
+		if (!fgets(buf, sizeof(buf), f))
+			goto parse_err;
+
+		if (sscanf(buf, "flags: 0%o", &s->flags) != 1) {
+			goto parse_err;
+		}
+
+		if (!fgets(buf, sizeof(buf), f))
+			goto parse_err;
+		if (!fgets(buf, sizeof(buf), f))
+			goto parse_err;
+
+		if (!fgets(buf, sizeof(buf), f))
+			goto parse_err;
+
+		if (sscanf(buf, "Pid: %d", &s->pid) != 1)
+			goto parse_err;
+		ret = 0;
+		break;
+	}
+
+	if (ret)
+		goto parse_err;
+err:
+	fclose(f);
+	return ret;
+
+parse_err:
+	pr_perror("Format error");
+	goto err;
+}
+
+static int check_pidfd(int fd, struct pidfd_status *old)
+{
+	struct pidfd_status new;
+
+	if (parse_self_fdinfo(fd, &new))
+		return -1;
+
+	show_pidfd("restored", &new);
+
+	if (old->flags != new.flags || old->pid != new.pid)
+		return -1;
+
+	return 0;
+}
+
+int main(int argc, char* argv[])
+{
+	struct pidfd_status old;
+	int pidfd, ret;
+
+	test_init(argc, argv);
+
+	pidfd = pidfd_open(getpid(), 0);
+	if (pidfd < 0) {
+		pr_perror("pidfd_open failed");
+		return 1;
+	}
+
+	if (parse_self_fdinfo(pidfd, &old))
+		return 1;
+	show_pidfd("old", &old);
+
+	if (pidfd_send_signal(pidfd, 0, NULL, 0)) {
+		pr_perror("Could not send signal");
+		return 1;
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	ret = check_pidfd(pidfd, &old);
+	if (ret) {
+		fail();
+		goto err;
+	}
+
+	if (pidfd_send_signal(pidfd, 0, NULL, 0)) {
+		pr_perror("Could not send signal");
+		fail();
+		goto err;
+	}
+
+	pass();
+	close(pidfd);
+	return 0;
+err:
+	close(pidfd);
+	return 1;
+}
From 4cec03aafdcf04e5ee25e4eb58e3abc13597296d Mon Sep 17 00:00:00 2001
From: Bhavik Sachdev
Date: Tue, 9 Jul 2024 20:01:00 +0530
Subject: [PATCH 311/321] zdtm: Check pidfd can send signal after C/R

Ensure `pidfd_send_signal()` syscall works as expected after C/R.
Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_child.c | 66 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 test/zdtm/static/pidfd_child.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index a2e852d73c..0268ae4927 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_child \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_child.c b/test/zdtm/static/pidfd_child.c new file mode 100644 index 0000000000..ec559605dc --- /dev/null +++ b/test/zdtm/static/pidfd_child.c @@ -0,0 +1,66 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks pidfd sends signal to child process after restore\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + int pidfd, status; + pid_t child; + + test_init(argc, argv); + + child = fork(); + if (child < 0) { + pr_perror("Unable to fork a new process"); + return 1; + } else if (child == 0) { + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + fail("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + goto err_close; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + goto err_close; + } + + pass(); + close(pidfd); + return 0; +err_close: + close(pidfd); + return 1; +} From 
6bf8ab1faadeb3f5d905222ed8e254edcd42f697 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Mon, 8 Jul 2024 22:25:00 +0530 Subject: [PATCH 312/321] zdtm: Check pidfd can kill descendant processes Validate that pidfds can been used to send signals to different processes after C/R using the `pidfd_send_signal()` syscall. Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_kill.c | 128 ++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 test/zdtm/static/pidfd_kill.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 0268ae4927..ab45b580af 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -55,6 +55,7 @@ TST_NOFILE := \ ptrace_sig \ pidfd_self \ pidfd_child \ + pidfd_kill \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_kill.c b/test/zdtm/static/pidfd_kill.c new file mode 100644 index 0000000000..6232d033aa --- /dev/null +++ b/test/zdtm/static/pidfd_kill.c @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Kill child and grandchild process using pidfds\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int wait_for_child(int child) +{ + int status; + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + test_msg("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + } + + return 0; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, gchild, cpidfd, gpidfd, gchild_pid, ret; + int p[2]; + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + test_init(argc, argv); + + child 
= fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + gchild = fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } + + if (gchild == 0) { + test_waitsig(); + return 0; + } + + close(p[READ]); + if (write(p[WRITE], &gchild, sizeof(gchild)) + != sizeof(gchild)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + test_waitsig(); + return wait_for_child(gchild); + } + + cpidfd = pidfd_open(child, 0); + if (cpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &gchild_pid, sizeof(gchild_pid)) + != sizeof(gchild_pid)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + + gpidfd = pidfd_open(gchild_pid, 0); + if (gpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(gpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + if (pidfd_send_signal(cpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + ret = wait_for_child(child); + if (ret) + goto fail_close; + + pass(); + close(cpidfd); + close(gpidfd); + return 0; + +fail_close: + fail(); + close(cpidfd); + close(gpidfd); + return 1; +} From 69a617900e7b4b07e6f09131a785bdc7712c2ed9 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 25 Jul 2024 01:12:36 +0530 Subject: [PATCH 313/321] zdtm: Check dead pidfd is restored correctly After, C/R of pidfds that point to dead processes their inodes might change. But if two pidfds point to same dead process they should continue to do so after C/R. This test ensures that this happens by calling `statx()` on pidfds after C/R and then comparing their inode numbers. Support for comparing pidfds by using `statx()` and inode numbers was introduced alongside pidfs. So if `f_type` of pidfd is not equal to `PID_FS_MAGIC` then we skip this test. 
signed-off-by: Bhavik Sachdev
---
 test/zdtm/static/Makefile | 1 +
 test/zdtm/static/pidfd_dead.c | 244 ++++++++++++++++++++++++++++++++++
 2 files changed, 245 insertions(+)
 create mode 100644 test/zdtm/static/pidfd_dead.c

diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile
index ab45b580af..20e4bc2721 100644
--- a/test/zdtm/static/Makefile
+++ b/test/zdtm/static/Makefile
@@ -54,6 +54,7 @@ TST_NOFILE := \
 		shm-mp \
 		ptrace_sig \
 		pidfd_self \
+		pidfd_dead \
 		pidfd_child \
 		pidfd_kill \
 		pipe00 \
diff --git a/test/zdtm/static/pidfd_dead.c b/test/zdtm/static/pidfd_dead.c
new file mode 100644
index 0000000000..9c825899d1
--- /dev/null
+++ b/test/zdtm/static/pidfd_dead.c
@@ -0,0 +1,244 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Check C/R of pidfds that point to dead processes\n";
+const char *test_author = "Bhavik Sachdev ";
+
+#ifndef PID_FS_MAGIC
+#define PID_FS_MAGIC 0x50494446
+#endif
+
+/*
+ * main
+ * `- child
+ *    `- grandchild
+ *
+ * main opens a pidfd for both child and grandchild.
+ * Before C/R we kill both child and grandchild.
+ * We end up with two unique dead pidfds.
+ */
+
+static long get_fs_type(int lfd)
+{
+	struct statfs fst;
+
+	if (fstatfs(lfd, &fst)) {
+		return -1;
+	}
+	return fst.f_type;
+}
+
+static int pidfd_open(pid_t pid, unsigned int flags)
+{
+	return syscall(__NR_pidfd_open, pid, flags);
+}
+
+static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags)
+{
+	return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
+}
+
+static int open_pidfd_pair(int pidfd[2], int pid)
+{
+	pidfd[0] = pidfd_open(pid, 0);
+	if (pidfd[0] < 0) {
+		pr_perror("pidfd_open() failed");
+		return 1;
+	}
+
+	pidfd[1] = pidfd_open(pid, 0);
+	if (pidfd[1] < 0) {
+		close(pidfd[0]);
+		pr_perror("pidfd_open() failed");
+		return 1;
+	}
+	return 0;
+}
+
+static int compare_pidfds(int pidfd[2])
+{
+	/*
+	 * After linux 6.9 we can compare inode numbers
+	 * to determine if two pidfds point to the same process.
+	 * While the inode number may change before and after C/R
+	 * pidfds pointing to the same pid should have the same inode number.
+	 */
+	struct statx stats[2];
+
+	if (statx(pidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]) ||
+	    statx(pidfd[1], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]))
+		return 1;
+	return stats[0].stx_ino != stats[1].stx_ino;
+}
+
+static int check_for_pidfs(void)
+{
+	long type;
+	int pidfd = pidfd_open(getpid(), 0);
+	if (pidfd < 0) {
+		pr_perror("pidfd open() failed");
+		return -1;
+	}
+	type = get_fs_type(pidfd);
+	close(pidfd);
+	return type == PID_FS_MAGIC;
+}
+
+int main(int argc, char* argv[])
+{
+	#define READ 0
+	#define WRITE 1
+
+	int child, ret, gchild, p[2], status;
+	int cpidfd[2], gpidfd[2];
+	struct statx stats[2];
+
+	test_init(argc, argv);
+
+	ret = check_for_pidfs();
+	if (ret < 0)
+		return 1;
+
+	if (ret == 0) {
+		test_daemon();
+		test_waitsig();
+		skip("Test requires pidfs. skipping...");
+		pass();
+		return 0;
+	}
+
+	if (pipe(p)) {
+		pr_perror("pipe");
+		return 1;
+	}
+
+	child = test_fork();
+	if (child < 0) {
+		pr_perror("fork");
+		return 1;
+	} else if (child == 0) {
+		int gchild = test_fork();
+		close(p[READ]);
+		if (gchild < 0) {
+			pr_perror("fork");
+			return 1;
+		} else if (gchild == 0) {
+			close(p[WRITE]);
+			while(1)
+				sleep(1000);
+		} else {
+			if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) {
+				pr_perror("write");
+				return 1;
+			}
+			close(p[WRITE]);
+			if (waitpid(gchild, &status, 0) != gchild) {
+				pr_perror("waitpid");
+				return 1;
+			}
+
+			if (!WIFSIGNALED(status)) {
+				fail("Expected grandchild to be terminated by a signal");
+				return 1;
+			}
+
+			if (WTERMSIG(status) != SIGKILL) {
+				fail("Expected grandchild to be terminated by SIGKILL");
+				return 1;
+			}
+
+			return 0;
+		}
+	}
+
+	ret = open_pidfd_pair(cpidfd, child);
+	if (ret)
+		return 1;
+
+	close(p[WRITE]);
+	if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) {
+		pr_perror("read");
+		return 1;
+	}
+	close(p[READ]);
+
+	ret = open_pidfd_pair(gpidfd, gchild);
+	if (ret)
+		return 1;
+
+	/*
+	 * We kill grandchild and child processes only after opening pidfds.
+	 */
+	if (pidfd_send_signal(gpidfd[0], SIGKILL, NULL, 0)) {
+		pr_perror("pidfd_send_signal");
+		goto fail_close;
+	}
+
+	if (waitpid(child, &status, 0) != child) {
+		pr_perror("waitpid");
+		goto fail_close;
+	}
+
+	if (!WIFEXITED(status)) {
+		fail("Expected child to exit normally");
+		goto fail_close;
+	}
+
+	if (WEXITSTATUS(status) != 0) {
+		fail("Expected child to exit with 0");
+		goto fail_close;
+	}
+	usleep(1000);
+
+	if (kill(gchild, 0) != -1 || errno != ESRCH) {
+		fail("Expected grand child to not exist");
+		goto fail_close;
+	}
+
+	if (kill(child, 0) != -1 || errno != ESRCH) {
+		fail("Expected child to not exist");
+		goto fail_close;
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	ret = compare_pidfds(cpidfd);
+	if (ret) {
+		fail("inodes not same for same pid");
+		goto fail_close;
+	}
+
+	ret = compare_pidfds(gpidfd);
+	if (ret) {
+		fail("inodes not same for same pid");
+		goto fail_close;
+	}
+
+	if (statx(cpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]) ||
+	    statx(gpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]) ||
+	    stats[0].stx_ino == stats[1].stx_ino) {
+		fail("pidfds pointing to diff pids should have diff inodes");
+		goto fail_close;
+	}
+
+	pass();
+	close(cpidfd[0]);
+	close(cpidfd[1]);
+	close(gpidfd[0]);
+	close(gpidfd[1]);
+	return 0;
+
+fail_close:
+	close(cpidfd[0]);
+	close(cpidfd[1]);
+	close(gpidfd[0]);
+	close(gpidfd[1]);
+	return 1;
+}
From bb1b1dc88f9d5638a7a7ce552fdfe51729cc09d6 Mon Sep 17 00:00:00 2001
From: Bhavik Sachdev
Date: Fri, 16 Aug 2024 21:20:57 +0530
Subject: [PATCH 314/321] zdtm: Check fd from pidfd_getfd is C/Red correctly

We get the read end of a pipe using `pidfd_getfd` and check if we can
read from it after C/R.
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/fd_from_pidfd.c | 108 +++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 test/zdtm/static/fd_from_pidfd.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 20e4bc2721..f4dbb1d96a 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -57,6 +57,7 @@ TST_NOFILE := \ pidfd_dead \ pidfd_child \ pidfd_kill \ + fd_from_pidfd \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/fd_from_pidfd.c b/test/zdtm/static/fd_from_pidfd.c new file mode 100644 index 0000000000..1f863d6c0e --- /dev/null +++ b/test/zdtm/static/fd_from_pidfd.c @@ -0,0 +1,108 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if fd obtained from pidfd_get_fd is C/R correctly\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) +{ + return syscall(__NR_pidfd_getfd, pidfd, targetfd, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int pidfd, child, p[2], child_read, read_data, status; + int data = 42; + + test_init(argc, argv); + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + close(p[WRITE]); + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + close(p[READ]); + if (write(p[WRITE], &data, sizeof(data)) != sizeof(data)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + child_read = pidfd_getfd(pidfd, p[READ], 0); + if 
(child_read < 0) { + pr_perror("pidfd_getfd"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (read(child_read, &read_data, sizeof(read_data)) != sizeof(read_data)) { + pr_perror("read"); + goto err_close; + } + + if (read_data != data) { + fail("data from fd obtained using pidfd_getfd incorrect"); + goto err_close; + } + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + pr_perror("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + + pass(); + close(child_read); + close(pidfd); + return 0; +err_close: + close(child_read); + close(pidfd); + return 1; +} From 56bc7396ad2a8e04b6d176e733bc0166c4df4fb6 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Mon, 26 Aug 2024 20:56:14 +0530 Subject: [PATCH 315/321] zdtm: Check pidfd for thread is valid after C/R We open a pidfd to a thread using `PIDFD_THREAD` flag and after C/R ensure that we can send signals using it with `PIDFD_SIGNAL_THREAD`. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_of_thread.c | 114 ++++++++++++++++++++++++++ test/zdtm/static/pidfd_of_thread.desc | 1 + 3 files changed, 116 insertions(+) create mode 100644 test/zdtm/static/pidfd_of_thread.c create mode 100644 test/zdtm/static/pidfd_of_thread.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index f4dbb1d96a..44ac64fe57 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_of_thread \ pidfd_dead \ pidfd_child \ pidfd_kill \ diff --git a/test/zdtm/static/pidfd_of_thread.c b/test/zdtm/static/pidfd_of_thread.c new file mode 100644 index 0000000000..d232c7ac1d --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check C/R of pidfds that point to threads\n"; +const char *test_author = "Bhavik Sachdev "; + +/* see also: https://codebrowser.dev/glibc/glibc/sysdeps/unix/sysv/linux/tst-clone3.c.html */ + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +#ifndef PIDFD_SIGNAL_THREAD +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#endif + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int thread_func(void *a) +{ + test_waitsig(); + return 0; +} + +#define CTID_INIT_VAL 1 + +int main(int argc, char* argv[]) +{ + char st[64 * 1024] __attribute__ ((aligned)); + pid_t tid; + int pidfd, test_pidfd; + futex_t exited; + + int clone_flags = 
CLONE_THREAD; + clone_flags |= CLONE_VM | CLONE_SIGHAND; + clone_flags |= CLONE_CHILD_CLEARTID; + + test_init(argc, argv); + + test_pidfd = pidfd_open(getpid(), 0); + if (test_pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + /* PIDFD_THREAD, PIDFD_SIGNAL_THREAD are supported only with pidfs */ + if (get_fs_type(test_pidfd) != PID_FS_MAGIC) { + test_daemon(); + test_waitsig(); + skip("pidfs not supported."); + close(test_pidfd); + return 0; + } + close(test_pidfd); + + futex_set(&exited, CTID_INIT_VAL); + + tid = clone(thread_func, st + sizeof(st), clone_flags, NULL, NULL, NULL, &(exited.raw)); + if (tid == -1) { + pr_perror("clone() failed"); + return 1; + } + + test_msg("Successfully created a thread with tid: %d\n", tid); + pidfd = pidfd_open(tid, PIDFD_THREAD); + if (pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, PIDFD_SIGNAL_THREAD)) { + pr_perror("pidfd_send_signal() failed"); + fail(); + close(pidfd); + return 1; + } + + test_msg("Waiting for thread to exit\n"); + futex_wait_until(&exited, 0); + + pass(); + close(pidfd); + return 0; +} diff --git a/test/zdtm/static/pidfd_of_thread.desc b/test/zdtm/static/pidfd_of_thread.desc new file mode 100644 index 0000000000..802caed655 --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.desc @@ -0,0 +1 @@ +{'flags': 'noauto crfail'} From 900f94e8ec53b1bc52cc0f5d1dbfc18f80738c83 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 12:39:18 +0100 Subject: [PATCH 316/321] make/lint: use 'ruff check ' The command `ruff ` has been deprecated and removed: https://astral.sh/blog/ruff-v0.5.0#removed-deprecated-features Signed-off-by: Radostin Stoyanov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 172d4b5177..e98eed0599 100644 --- a/Makefile +++ b/Makefile @@ -447,7 +447,7 @@ help: ruff: @ruff --version - ruff ${RUFF_FLAGS} 
--config=scripts/ruff.toml \ + ruff check ${RUFF_FLAGS} --config=scripts/ruff.toml \ test/zdtm.py \ test/inhfd/*.py \ test/others/rpc/config_file.py \ From 59afbf3438252f281974579ecd86b78b7536cc61 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 13:58:41 +0100 Subject: [PATCH 317/321] pycriu: fix lint errors This patch fixes the following errors reported by ruff: lib/pycriu/images/pb2dict.py:307:24: E721 Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks | 305 | elif field.type in _basic_cast: 306 | cast = _basic_cast[field.type] 307 | if pretty and (cast == int): | ^^^^^^^^^^^ E721 308 | if is_hex: 309 | # Fields that have (criu).hex = true option set | lib/pycriu/images/pb2dict.py:379:13: E721 Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks | 377 | elif field.type in _basic_cast: 378 | cast = _basic_cast[field.type] 379 | if (cast == int) and is_string(value): | ^^^^^^^^^^^ E721 380 | if _marked_as_dev(field): 381 | return encode_dev(field, value) | Signed-off-by: Radostin Stoyanov --- lib/pycriu/images/pb2dict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 0d1a246927..e3dd95ac0a 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -304,7 +304,7 @@ def _pb2dict_cast(field, value, pretty=False, is_hex=False): return field.enum_type.values_by_number.get(value, None).name elif field.type in _basic_cast: cast = _basic_cast[field.type] - if pretty and (cast == int): + if pretty and cast is int: if is_hex: # Fields that have (criu).hex = true option set # should be stored in hex string format. 
@@ -376,7 +376,7 @@ def _dict2pb_cast(field, value): return field.enum_type.values_by_name.get(value, None).number elif field.type in _basic_cast: cast = _basic_cast[field.type] - if (cast == int) and is_string(value): + if cast is int and is_string(value): if _marked_as_dev(field): return encode_dev(field, value) From 18f7207e26f608a70c5db151ec852470975f1c29 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 4 Oct 2024 12:14:29 +0100 Subject: [PATCH 318/321] images/inventory: add field for enabled plugins This patch extends the inventory image with a `plugins` field that contains an array of plugins which were used during checkpoint, for example, to save GPU state. In particular, the CUDA and AMDGPU plugins are added to this field only when the checkpoint contains GPU state. This allows us to disable unnecessary plugins during restore, show appropriate error messages if required CRIU plugins are missing, and migrate a process that does not use a GPU from a GPU-enabled system to a CPU-only environment. We use the `optional plugins_entry` for backwards compatibility. This entry allows us to distinguish between an *empty* and a *missing* field: - When the field is missing, it indicates that the checkpoint was created with a previous version of CRIU, and all plugins should be *enabled* during restore. - When the field is empty, it indicates that no plugins were used during checkpointing. Thus, all plugins can be *disabled* during restore.
Signed-off-by: Radostin Stoyanov --- criu/cr-restore.c | 6 +- criu/image.c | 124 +++++++++++++++++++++++++++++++++ criu/include/image.h | 4 ++ criu/plugin.c | 3 + images/inventory.proto | 8 +++ plugins/amdgpu/amdgpu_plugin.c | 31 +++++++++ plugins/cuda/cuda_plugin.c | 22 +++++- 7 files changed, 193 insertions(+), 5 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index d5b6c8037a..646300bdb8 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2354,12 +2354,12 @@ int cr_restore_tasks(void) if (init_service_fd()) return 1; - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) - return -1; - if (check_img_inventory(/* restore = */ true) < 0) goto err; + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; + if (init_stats(RESTORE_STATS)) goto err; diff --git a/criu/image.c b/criu/image.c index 9fb390ab7e..9589167fb1 100644 --- a/criu/image.c +++ b/criu/image.c @@ -26,6 +26,14 @@ TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +struct inventory_plugin { + struct list_head node; + char *name; +}; + +struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list); +static int n_inventory_plugins; + int check_img_inventory(bool restore) { int ret = -1; @@ -99,6 +107,19 @@ int check_img_inventory(bool restore) } else { opts.network_lock_method = he->network_lock_method; } + + if (!he->plugins_entry) { + /* backwards compatibility: if the 'plugins_entry' field is missing, + * all plugins should be enabled during restore. + */ + n_inventory_plugins = -1; + } else { + PluginsEntry *pe = he->plugins_entry; + for (int i = 0; i < pe->n_plugins; i++) { + if (add_inventory_plugin(pe->plugins[i])) + goto out_err; + } + } } ret = 0; @@ -110,8 +131,92 @@ int check_img_inventory(bool restore) return ret; } +/** + * Check if the 'plugins' field in the inventory image contains + * the specified plugin name. If found, the plugin is removed + * from the linked list. 
+ */ +bool check_and_remove_inventory_plugin(const char *name, size_t n) +{ + if (n_inventory_plugins == -1) + return true; /* backwards compatibility */ + + if (n_inventory_plugins > 0) { + struct inventory_plugin *p, *tmp; + + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + if (!strncmp(name, p->name, n)) { + xfree(p->name); + list_del(&p->node); + xfree(p); + n_inventory_plugins--; + return true; + } + } + } + + return false; +} + +/** + * We expect during restore all loaded plugins to be removed from + * the inventory_plugins_list. If the list is not empty, show an + * error message for each missing plugin. + */ +int check_inventory_plugins(void) +{ + struct inventory_plugin *p; + + if (n_inventory_plugins <= 0) + return 0; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pr_err("Missing required plugin: %s\n", p->name); + } + + return -1; +} + +/** + * Add plugin name to the inventory image. These values + * can be used to identify required plugins during restore. 
+ */ +int add_inventory_plugin(const char *name) +{ + struct inventory_plugin *p; + + p = xmalloc(sizeof(struct inventory_plugin)); + if (p == NULL) + return -1; + + p->name = xstrdup(name); + if (!p->name) { + xfree(p); + return -1; + } + list_add(&p->node, &inventory_plugins_list); + n_inventory_plugins++; + + return 0; +} + +void free_inventory_plugins_list(void) +{ + struct inventory_plugin *p, *tmp; + + if (!list_empty(&inventory_plugins_list)) { + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + xfree(p->name); + list_del(&p->node); + xfree(p); + } + } + n_inventory_plugins = 0; +} + int write_img_inventory(InventoryEntry *he) { + PluginsEntry pe = PLUGINS_ENTRY__INIT; struct cr_img *img; int ret; @@ -121,8 +226,27 @@ int write_img_inventory(InventoryEntry *he) if (!img) return -1; + if (!list_empty(&inventory_plugins_list)) { + struct inventory_plugin *p; + int i = 0; + + pe.n_plugins = n_inventory_plugins; + pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *)); + if (!pe.plugins) + return -1; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pe.plugins[i] = p->name; + i++; + } + } + he->plugins_entry = &pe; + ret = pb_write_one(img, he, PB_INVENTORY); + free_inventory_plugins_list(); + xfree(pe.plugins); + xfree(he->root_ids); close_image(img); if (ret < 0) diff --git a/criu/include/image.h b/criu/include/image.h index a17aae35c2..afa7d5e12f 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -177,4 +177,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size); extern void close_image(struct cr_img *); +extern int add_inventory_plugin(const char *name); +extern int check_inventory_plugins(void); +extern bool check_and_remove_inventory_plugin(const char *name, size_t n); + #endif /* __CR_IMAGE_H__ */ diff --git a/criu/plugin.c b/criu/plugin.c index 58b5ea5bfe..65e79a0692 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -256,6 +256,9 @@ int cr_plugin_init(int stage) goto err; } + if (stage == 
CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins()) + goto err; + exit_code = 0; err: closedir(d); diff --git a/images/inventory.proto b/images/inventory.proto index a735bad1d0..7f655031bc 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -10,6 +10,13 @@ enum lsmtype { APPARMOR = 2; } +// It is not possible to distinguish between an empty repeated field +// and an unset repeated field. To solve this problem and provide backwards +// compatibility, we use the 'plugins_entry' message. +message plugins_entry { + repeated string plugins = 12; +}; + message inventory_entry { required uint32 img_version = 1; optional bool fdinfo_per_id = 2; @@ -21,4 +28,5 @@ message inventory_entry { optional uint32 pre_dump_mode = 9; optional bool tcp_close = 10; optional uint32 network_lock_method = 11; + optional plugins_entry plugins_entry = 12; } diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index b56ba6d140..96c0861628 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -60,6 +60,10 @@ static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; +bool plugin_added_to_inventory = false; + +bool plugin_disabled = false; + /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -332,6 +336,13 @@ void getenv_size_t(const char *var, size_t *value) int amdgpu_plugin_init(int stage) { + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); topology_init(&src_topology); @@ -365,6 +376,9 @@ int amdgpu_plugin_init(int stage) void amdgpu_plugin_fini(int stage, int ret) { + if (plugin_disabled) + return; + pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); if (stage == CR_PLUGIN_STAGE__RESTORE) @@ -414,6 +428,14
@@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) if (ret) pr_perror("%s(), Can't handle VMAs of input device", __func__); + if (!ret && !plugin_added_to_inventory) { + ret = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (ret) + pr_err("Failed to add AMDGPU plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + return ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -1540,6 +1562,9 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; + if (plugin_disabled) + return -ENOTSUP; + pr_info("Initialized kfd plugin restorer with ID = %d\n", id); snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); @@ -1746,6 +1771,9 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const char *p_end; bool is_kfd = false, is_renderD = false; + if (plugin_disabled) + return -ENOTSUP; + plugin_log_msg("Enter %s\n", __func__); strncpy(path, in_path, sizeof(path)); @@ -1805,6 +1833,9 @@ int amdgpu_plugin_resume_devices_late(int target_pid) struct kfd_ioctl_criu_args args = { 0 }; int fd, exit_code = 0; + if (plugin_disabled) + return -ENOTSUP; + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 23c3f4b1ab..c4fc67fa9f 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -38,6 +38,8 @@ */ bool plugin_disabled = false; +bool plugin_added_to_inventory = false; + struct pid_info { int pid; char checkpointed; @@ -319,7 +321,7 @@ int cuda_plugin_checkpoint_devices(int pid) k_rtsigset_t save_sigset; if (plugin_disabled) { - return 0; + return -ENOTSUP; } restore_tid = get_cuda_restore_tid(pid); @@ -354,6 +356,15 @@ int cuda_plugin_checkpoint_devices(int pid) pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); } } + + if (!status && 
!plugin_added_to_inventory) { + status = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (status) + pr_err("Failed to add CUDA plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); @@ -367,7 +378,7 @@ int cuda_plugin_pause_devices(int pid) char msg_buf[CUDA_CKPT_BUF_SIZE]; if (plugin_disabled) { - return 0; + return -ENOTSUP; } restore_tid = get_cuda_restore_tid(pid); @@ -463,6 +474,13 @@ int cuda_plugin_init(int stage) { int ret; + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n"); plugin_disabled = true; From f5d59ecade20441e7797cd042298f0777e62b958 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 13:36:22 +0100 Subject: [PATCH 319/321] zdtm: add inventory test plugins This patch adds two test plugins to verify that CRIU plugins listed in the inventory image are enabled, while those that are not listed can be disabled. 
Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 1 + test/plugins/Makefile | 16 +++++++++++++++- test/plugins/inventory_test_disabled_plugin.c | 17 +++++++++++++++++ test/plugins/inventory_test_enabled_plugin.c | 17 +++++++++++++++++ test/zdtm.py | 2 +- 5 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 test/plugins/inventory_test_disabled_plugin.c create mode 100644 test/plugins/inventory_test_enabled_plugin.c diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 38b7b5097f..b472e954c2 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -362,5 +362,6 @@ make -C plugins/amdgpu/ test_topology_remap ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/busyloop00 --criu-plugin inventory_test_enabled inventory_test_disabled ./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138 diff --git a/test/plugins/Makefile b/test/plugins/Makefile index 7827b655c4..4f620ad503 100644 --- a/test/plugins/Makefile +++ b/test/plugins/Makefile @@ -1,5 +1,13 @@ SRC_DIR := ../../plugins -PLUGIN_TARGETS := amdgpu_plugin.so cuda_plugin.so +PLUGIN_TARGETS := inventory_test_enabled_plugin.so inventory_test_disabled_plugin.so amdgpu_plugin.so cuda_plugin.so + +ARCH := x86 + +PLUGIN_INCLUDE := -iquote../../include +PLUGIN_INCLUDE += -iquote../../criu/include +PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ +PLUGIN_INCLUDE += -iquote../../ +PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC # Silent make rules. 
Q := @ @@ -12,6 +20,12 @@ amdgpu_plugin.so: $(SRC_DIR)/amdgpu/amdgpu_plugin.so cuda_plugin.so: $(SRC_DIR)/cuda/cuda_plugin.so $(Q) cp $< $@ +inventory_test_enabled_plugin.so: inventory_test_enabled_plugin.c + $(Q) $(CC) $(PLUGIN_CFLAGS) $< -o $@ $(PLUGIN_INCLUDE) + +inventory_test_disabled_plugin.so: inventory_test_disabled_plugin.c + $(Q) $(CC) $(PLUGIN_CFLAGS) $< -o $@ $(PLUGIN_INCLUDE) + clean: $(Q) $(RM) $(PLUGIN_TARGETS) diff --git a/test/plugins/inventory_test_disabled_plugin.c b/test/plugins/inventory_test_disabled_plugin.c new file mode 100644 index 0000000000..468fe924b1 --- /dev/null +++ b/test/plugins/inventory_test_disabled_plugin.c @@ -0,0 +1,17 @@ +#include "criu-plugin.h" +#include "image.h" + +int inventory_test_disabled_plugin_init(int stage) +{ + if (stage == CR_PLUGIN_STAGE__RESTORE) + return check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)); + + return 0; +} + +void inventory_test_disabled_plugin_fini(int stage, int ret) +{ + return; +} + +CR_PLUGIN_REGISTER("inventory_test_disabled_plugin", inventory_test_disabled_plugin_init, inventory_test_disabled_plugin_fini) \ No newline at end of file diff --git a/test/plugins/inventory_test_enabled_plugin.c b/test/plugins/inventory_test_enabled_plugin.c new file mode 100644 index 0000000000..89e684e2ac --- /dev/null +++ b/test/plugins/inventory_test_enabled_plugin.c @@ -0,0 +1,17 @@ +#include "criu-plugin.h" +#include "image.h" + +int inventory_test_enabled_plugin_init(int stage) +{ + if (stage == CR_PLUGIN_STAGE__RESTORE) + return !check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)); + + return add_inventory_plugin(CR_PLUGIN_DESC.name); +} + +void inventory_test_enabled_plugin_fini(int stage, int ret) +{ + return; +} + +CR_PLUGIN_REGISTER("inventory_test_enabled_plugin", inventory_test_enabled_plugin_init, inventory_test_enabled_plugin_fini) \ No newline at end of file diff --git a/test/zdtm.py b/test/zdtm.py index 6b2132cc30..37ebe63b7b 
100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2877,7 +2877,7 @@ def get_cli_args(): rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") rp.add_argument("--criu-plugin", help="Run tests with CRIU plugin", - choices=['amdgpu', 'cuda'], + choices=['amdgpu', 'cuda', 'inventory_test_enabled', 'inventory_test_disabled'], nargs='+', default=None) rp.add_argument("--mocked-cuda-checkpoint", From dfb56eed62f2b1dd7eaf830f44ae83eca6a1dc46 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Wed, 9 Oct 2024 09:50:28 +0100 Subject: [PATCH 320/321] pidfd: block SIGCHLD during tmp process creation This patch blocks SIGCHLD during temporary process creation to prevent a race condition between kill() and waitpid() where sigchld_handler() causes `criu restore` to fail with an error. Fixes: #2490 Signed-off-by: Bhavik Sachdev Signed-off-by: Radostin Stoyanov --- criu/pidfd.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/criu/pidfd.c b/criu/pidfd.c index fdf5dec60e..3ea3c93094 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -145,6 +145,20 @@ static int create_tmp_process(void) static int free_dead_pidfd(struct dead_pidfd *dead) { int status; + sigset_t blockmask, oldmask; + + /* + * Block SIGCHLD to prevent interference from sigchld_handler() + * and to properly handle the tmp process termination without + * a race condition. A similar approach is used in cr_system().
+ */ + sigemptyset(&oldmask); + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { + pr_perror("Cannot set mask of blocked signals"); + goto err; + } if (kill(dead->pid, SIGKILL) < 0) { pr_perror("Could not kill temporary process with pid: %d", @@ -158,6 +172,12 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } + /* Restore the original signal mask after tmp process has terminated */ + if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { + pr_perror("Cannot clear blocked signals"); + goto err; + } + if (!WIFSIGNALED(status)) { pr_err("Expected temporary process to be terminated by a signal\n"); goto err; From 202a7fb7e131e73a0c113bc9fb6b4a9c394f369a Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Fri, 18 Oct 2024 18:51:18 +0200 Subject: [PATCH 321/321] criu: Initialize util before service worker starts When restoring dumps in new mount + pid namespaces where multiple dumps share the same network namespace, CRIU may fail due to conflicting unix socket names. This happens because the service worker creates sockets using a pattern that includes criu_run_id, but util_init() is called after cr_service_work() starts. The socket naming pattern "crtools-fd-%d-%d" uses the restore PID and criu_run_id, however criu_run_id is always 0 when not initialized, leading to conflicts when multiple restores run simultaneously either in the same CRIU process or because of multiple CRIU processes doing the same operation in different PID namespaces. Fix this by: - Moving util_init() before cr_service_work() starts - Adding a second util_init() call in the service worker fork to ensure unique IDs across multiple worker runs - Making sure that dump and restore operations have util_init() called early to generate unique socket names With this fix, socket names always include the namespace ID, preventing conflicts when multiple processes with the same pid share a network namespace. 
Fixes #2499 Signed-off-by: Lorenzo Fontana --- criu/cr-service.c | 9 +++++++++ criu/crtools.c | 12 +++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 61a04c5ffe..adb5cedde3 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -1310,6 +1310,8 @@ int cr_service_work(int sk) int ret = -1; CriuReq *msg = 0; + util_init(); + more: opts.mode = CR_SWRK; @@ -1528,6 +1530,13 @@ int cr_service(bool daemon_mode) close(server_fd); init_opts(); + /* + * We want to have a unique criu_run_id + * here so that each service worker fork here + * can create its own socket file descriptors + * despite being in the same network namespace. + */ + util_init(); ret = cr_service_work(sk); close(sk); exit(ret != 0); diff --git a/criu/crtools.c b/criu/crtools.c index 94657f4186..b67af0b72e 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -169,7 +169,15 @@ int main(int argc, char *argv[], char *envp[]) pr_err("unknown command: %s\n", argv[optind]); goto usage; } - + /* + * During dump, restore and parasite it's important for us + * to initialize criu_run_id and compel_run_id so that + * sockets and file descriptors are generated with a unique + * name identifying the specific process even in cases + * where multiple processes with the same pid in different + * pid namespaces are sharing the same network namespace. + */ + util_init(); if (opts.mode == CR_SWRK) { if (argc != optind + 2) { fprintf(stderr, "Usage: criu swrk \n"); @@ -254,8 +262,6 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - util_init(); - if (log_init(opts.output)) return 1;