Diffstat (limited to 'system/xen/xsa')
38 files changed, 0 insertions, 4447 deletions
diff --git a/system/xen/xsa/xsa317.patch b/system/xen/xsa/xsa317.patch deleted file mode 100644 index 20e2c643d0..0000000000 --- a/system/xen/xsa/xsa317.patch +++ /dev/null @@ -1,50 +0,0 @@ -From aeb46e92f915f19a61d5a8a1f4b696793f64e6fb Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Thu, 19 Mar 2020 13:17:31 +0000 -Subject: [PATCH] xen/common: event_channel: Don't ignore error in - get_free_port() - -Currently, get_free_port() is assuming that the port has been allocated -when evtchn_allocate_port() is not return -EBUSY. - -However, the function may return an error when: - - We exhausted all the event channels. This can happen if the limit - configured by the administrator for the guest ('max_event_channels' - in xl cfg) is higher than the ABI used by the guest. For instance, - if the guest is using 2L, the limit should not be higher than 4095. - - We cannot allocate memory (e.g Xen has not more memory). - -Users of get_free_port() (such as EVTCHNOP_alloc_unbound) will validly -assuming the port was valid and will next call evtchn_from_port(). This -will result to a crash as the memory backing the event channel structure -is not present. - -Fixes: 368ae9a05fe ("xen/pvshim: forward evtchn ops between L0 Xen and L2 DomU") -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/common/event_channel.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c -index e86e2bfab0..a8d182b584 100644 ---- a/xen/common/event_channel.c -+++ b/xen/common/event_channel.c -@@ -195,10 +195,10 @@ static int get_free_port(struct domain *d) - { - int rc = evtchn_allocate_port(d, port); - -- if ( rc == -EBUSY ) -- continue; -- -- return port; -+ if ( rc == 0 ) -+ return port; -+ else if ( rc != -EBUSY ) -+ return rc; - } - - return -ENOSPC; --- -2.17.1 - diff --git a/system/xen/xsa/xsa319.patch b/system/xen/xsa/xsa319.patch deleted file mode 100644 index 769443c900..0000000000 --- a/system/xen/xsa/xsa319.patch +++ /dev/null @@ -1,27 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: x86/shadow: correct an inverted conditional in dirty VRAM tracking - -This originally was "mfn_x(mfn) == INVALID_MFN". Make it like this -again, taking the opportunity to also drop the unnecessary nearby -braces. - -This is XSA-319. 
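The control flow the xsa317 hunk establishes is easier to see outside the diff. Below is a minimal standalone C sketch, with a stub allocator standing in for evtchn_allocate_port() and the real Xen types omitted:

#include <errno.h>

#define MAX_PORTS 4

static int port_used[MAX_PORTS];

/* Stub for evtchn_allocate_port(): 0 on success, -EBUSY if the port is
 * taken, any other negative errno on a hard failure (e.g. -ENOMEM). */
static int allocate_port(int port)
{
    if (port_used[port])
        return -EBUSY;
    port_used[port] = 1;
    return 0;
}

static int get_free_port(void)
{
    for (int port = 0; port < MAX_PORTS; port++) {
        int rc = allocate_port(port);

        if (rc == 0)
            return port;   /* the port really was allocated */
        else if (rc != -EBUSY)
            return rc;     /* hard error: propagate, don't keep scanning */
    }

    return -ENOSPC;        /* every port tried and busy */
}

Only -EBUSY means "try the next port"; any other error is propagated, so callers can never mistake an unallocated port for a valid one.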
- -Fixes: 246a5a3377c2 ("xen: Use a typesafe to define INVALID_MFN") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> - ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -3252,10 +3252,8 @@ int shadow_track_dirty_vram(struct domai - int dirty = 0; - paddr_t sl1ma = dirty_vram->sl1ma[i]; - -- if ( !mfn_eq(mfn, INVALID_MFN) ) -- { -+ if ( mfn_eq(mfn, INVALID_MFN) ) - dirty = 1; -- } - else - { - page = mfn_to_page(mfn); diff --git a/system/xen/xsa/xsa320-4.13-1.patch b/system/xen/xsa/xsa320-4.13-1.patch deleted file mode 100644 index 09eb8ea98e..0000000000 --- a/system/xen/xsa/xsa320-4.13-1.patch +++ /dev/null @@ -1,117 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/spec-ctrl: CPUID/MSR definitions for Special Register Buffer Data Sampling - -This is part of XSA-320 / CVE-2020-0543 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Wei Liu <wl@xen.org> - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 1d9d816622..9268454297 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -483,10 +483,10 @@ accounting for hardware capabilities as enumerated via CPUID. - - Currently accepted: - --The Speculation Control hardware features `md-clear`, `ibrsb`, `stibp`, `ibpb`, --`l1d-flush` and `ssbd` are used by default if available and applicable. They can --be ignored, e.g. `no-ibrsb`, at which point Xen won't use them itself, and --won't offer them to guests. -+The Speculation Control hardware features `srbds-ctrl`, `md-clear`, `ibrsb`, -+`stibp`, `ibpb`, `l1d-flush` and `ssbd` are used by default if available and -+applicable. They can be ignored, e.g. `no-ibrsb`, at which point Xen won't -+use them itself, and won't offer them to guests. - - ### cpuid_mask_cpu - > `= fam_0f_rev_[cdefg] | fam_10_rev_[bc] | fam_11_rev_b` -diff --git a/tools/libxl/libxl_cpuid.c b/tools/libxl/libxl_cpuid.c -index 6cea4227ba..a78f08b927 100644 ---- a/tools/libxl/libxl_cpuid.c -+++ b/tools/libxl/libxl_cpuid.c -@@ -213,6 +213,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) - - {"avx512-4vnniw",0x00000007, 0, CPUID_REG_EDX, 2, 1}, - {"avx512-4fmaps",0x00000007, 0, CPUID_REG_EDX, 3, 1}, -+ {"srbds-ctrl", 0x00000007, 0, CPUID_REG_EDX, 9, 1}, - {"md-clear", 0x00000007, 0, CPUID_REG_EDX, 10, 1}, - {"cet-ibt", 0x00000007, 0, CPUID_REG_EDX, 20, 1}, - {"ibrsb", 0x00000007, 0, CPUID_REG_EDX, 26, 1}, -diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c -index 603e1d65fd..a09440813b 100644 ---- a/tools/misc/xen-cpuid.c -+++ b/tools/misc/xen-cpuid.c -@@ -157,6 +157,7 @@ static const char *const str_7d0[32] = - [ 2] = "avx512_4vnniw", [ 3] = "avx512_4fmaps", - [ 4] = "fsrm", - -+ /* 8 */ [ 9] = "srbds-ctrl", - [10] = "md-clear", - /* 12 */ [13] = "tsx-force-abort", - -diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c -index 4b12103482..0cded3c0ad 100644 ---- a/xen/arch/x86/msr.c -+++ b/xen/arch/x86/msr.c -@@ -134,6 +134,7 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val) - /* Write-only */ - case MSR_TSX_FORCE_ABORT: - case MSR_TSX_CTRL: -+ case MSR_MCU_OPT_CTRL: - case MSR_U_CET: - case MSR_S_CET: - case MSR_PL0_SSP ... 
MSR_INTERRUPT_SSP_TABLE: -@@ -288,6 +289,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) - /* Read-only */ - case MSR_TSX_FORCE_ABORT: - case MSR_TSX_CTRL: -+ case MSR_MCU_OPT_CTRL: - case MSR_U_CET: - case MSR_S_CET: - case MSR_PL0_SSP ... MSR_INTERRUPT_SSP_TABLE: -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 6656c44aec..5fc1c6827e 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -312,12 +312,13 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - printk("Speculative mitigation facilities:\n"); - - /* Hardware features which pertain to speculative mitigations. */ -- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", - (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", - (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "", - (_7d0 & cpufeat_mask(X86_FEATURE_SSBD)) ? " SSBD" : "", - (_7d0 & cpufeat_mask(X86_FEATURE_MD_CLEAR)) ? " MD_CLEAR" : "", -+ (_7d0 & cpufeat_mask(X86_FEATURE_SRBDS_CTRL)) ? " SRBDS_CTRL" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ? " IBPB" : "", - (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index 7693c4a71a..91994669e1 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -179,6 +179,9 @@ - #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x490 - #define MSR_IA32_VMX_VMFUNC 0x491 - -+#define MSR_MCU_OPT_CTRL 0x00000123 -+#define MCU_OPT_CTRL_RNGDS_MITG_DIS (_AC(1, ULL) << 0) -+ - #define MSR_U_CET 0x000006a0 - #define MSR_S_CET 0x000006a2 - #define MSR_PL0_SSP 0x000006a4 -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index 2835688f1c..a2482c3627 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -252,6 +252,7 @@ XEN_CPUFEATURE(IBPB, 8*32+12) /*A IBPB support only (no IBRS, used by - /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */ - XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */ - XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single Precision */ -+XEN_CPUFEATURE(SRBDS_CTRL, 9*32+ 9) /* MSR_MCU_OPT_CTRL and RNGDS_MITG_DIS. */ - XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*A VERW clears microarchitectural buffers */ - XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ - XEN_CPUFEATURE(CET_IBT, 9*32+20) /* CET - Indirect Branch Tracking */ diff --git a/system/xen/xsa/xsa320-4.13-2.patch b/system/xen/xsa/xsa320-4.13-2.patch deleted file mode 100644 index 8a8080a312..0000000000 --- a/system/xen/xsa/xsa320-4.13-2.patch +++ /dev/null @@ -1,179 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/spec-ctrl: Mitigate the Special Register Buffer Data Sampling sidechannel - -See patch documentation and comments. 
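Reduced to constants, the enumeration added by the first xsa320 patch is small. A hedged sketch follows (the values are copied from the hunks above; the helper name is illustrative, not a Xen function):

#include <stdbool.h>
#include <stdint.h>

#define MSR_MCU_OPT_CTRL            0x00000123
#define MCU_OPT_CTRL_RNGDS_MITG_DIS (1ULL << 0)

/* SRBDS_CTRL: CPUID leaf 7, subleaf 0, EDX bit 9. */
#define CPUFEAT_SRBDS_CTRL          (1u << 9)

/* True when the CPU advertises MSR_MCU_OPT_CTRL via SRBDS_CTRL. */
static bool cpu_has_srbds_ctrl(uint32_t cpuid_7d0)
{
    return cpuid_7d0 & CPUFEAT_SRBDS_CTRL;
}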
- -This is part of XSA-320 / CVE-2020-0543 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 9268454297..c780312531 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -1991,7 +1991,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - ### spec-ctrl (x86) - > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu, --> l1d-flush,branch-harden}=<bool> ]` -+> l1d-flush,branch-harden,srb-lock}=<bool> ]` - - Controls for speculative execution sidechannel mitigations. By default, Xen - will pick the most appropriate mitigations based on compiled in support, -@@ -2068,6 +2068,12 @@ If Xen is compiled with `CONFIG_SPECULATIVE_HARDEN_BRANCH`, the - speculation barriers to protect selected conditional branches. By default, - Xen will enable this mitigation. - -+On hardware supporting SRBDS_CTRL, the `srb-lock=` option can be used to force -+or prevent Xen from protect the Special Register Buffer from leaking stale -+data. By default, Xen will enable this mitigation, except on parts where MDS -+is fixed and TAA is fixed/mitigated (in which case, there is believed to be no -+way for an attacker to obtain the stale data). -+ - ### sync_console - > `= <boolean>` - -diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c -index feb0f6ce20..75c6e34164 100644 ---- a/xen/arch/x86/acpi/power.c -+++ b/xen/arch/x86/acpi/power.c -@@ -295,6 +295,9 @@ static int enter_state(u32 state) - ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr); - spec_ctrl_exit_idle(ci); - -+ if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) ) -+ wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl); -+ - done: - spin_debug_enable(); - local_irq_restore(flags); -diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c -index dc8fdac1a1..b1e51b3aff 100644 ---- a/xen/arch/x86/smpboot.c -+++ b/xen/arch/x86/smpboot.c -@@ -361,12 +361,14 @@ void start_secondary(void *unused) - microcode_update_one(false); - - /* -- * If MSR_SPEC_CTRL is available, apply Xen's default setting and discard -- * any firmware settings. Note: MSR_SPEC_CTRL may only become available -- * after loading microcode. -+ * If any speculative control MSRs are available, apply Xen's default -+ * settings. Note: These MSRs may only become available after loading -+ * microcode. - */ - if ( boot_cpu_has(X86_FEATURE_IBRSB) ) - wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl); -+ if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) ) -+ wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl); - - tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 5fc1c6827e..33343062a7 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -65,6 +65,9 @@ static unsigned int __initdata l1d_maxphysaddr; - static bool __initdata cpu_has_bug_msbds_only; /* => minimal HT impact. */ - static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. 
*/ - -+static int8_t __initdata opt_srb_lock = -1; -+uint64_t __read_mostly default_xen_mcu_opt_ctrl; -+ - static int __init parse_spec_ctrl(const char *s) - { - const char *ss; -@@ -112,6 +115,7 @@ static int __init parse_spec_ctrl(const char *s) - opt_ssbd = false; - opt_l1d_flush = 0; - opt_branch_harden = false; -+ opt_srb_lock = 0; - } - else if ( val > 0 ) - rc = -EINVAL; -@@ -178,6 +182,8 @@ static int __init parse_spec_ctrl(const char *s) - opt_l1d_flush = val; - else if ( (val = parse_boolean("branch-harden", s, ss)) >= 0 ) - opt_branch_harden = val; -+ else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) -+ opt_srb_lock = val; - else - rc = -EINVAL; - -@@ -341,7 +347,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - "\n"); - - /* Settings for Xen's protection, irrespective of guests. */ -- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s%s\n", -+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s%s%s\n", - thunk == THUNK_NONE ? "N/A" : - thunk == THUNK_RETPOLINE ? "RETPOLINE" : - thunk == THUNK_LFENCE ? "LFENCE" : -@@ -352,6 +358,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", - !(caps & ARCH_CAPS_TSX_CTRL) ? "" : - (opt_tsx & 1) ? " TSX+" : " TSX-", -+ !boot_cpu_has(X86_FEATURE_SRBDS_CTRL) ? "" : -+ opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", - opt_ibpb ? " IBPB" : "", - opt_l1d_flush ? " L1D_FLUSH" : "", - opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "", -@@ -1149,6 +1157,34 @@ void __init init_speculation_mitigations(void) - tsx_init(); - } - -+ /* Calculate suitable defaults for MSR_MCU_OPT_CTRL */ -+ if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) ) -+ { -+ uint64_t val; -+ -+ rdmsrl(MSR_MCU_OPT_CTRL, val); -+ -+ /* -+ * On some SRBDS-affected hardware, it may be safe to relax srb-lock -+ * by default. -+ * -+ * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only way -+ * to access the Fill Buffer. If TSX isn't available (inc. SKU -+ * reasons on some models), or TSX is explicitly disabled, then there -+ * is no need for the extra overhead to protect RDRAND/RDSEED. -+ */ -+ if ( opt_srb_lock == -1 && -+ (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO && -+ (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && opt_tsx == 0)) ) -+ opt_srb_lock = 0; -+ -+ val &= ~MCU_OPT_CTRL_RNGDS_MITG_DIS; -+ if ( !opt_srb_lock ) -+ val |= MCU_OPT_CTRL_RNGDS_MITG_DIS; -+ -+ default_xen_mcu_opt_ctrl = val; -+ } -+ - print_details(thunk, caps); - - /* -@@ -1180,6 +1216,9 @@ void __init init_speculation_mitigations(void) - - wrmsrl(MSR_SPEC_CTRL, bsp_delay_spec_ctrl ? 
0 : default_xen_spec_ctrl); - } -+ -+ if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) ) -+ wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl); - } - - static void __init __maybe_unused build_assertions(void) -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 9caecddfec..b252bb8631 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -54,6 +54,8 @@ extern int8_t opt_pv_l1tf_hwdom, opt_pv_l1tf_domu; - */ - extern paddr_t l1tf_addr_mask, l1tf_safe_maddr; - -+extern uint64_t default_xen_mcu_opt_ctrl; -+ - static inline void init_shadow_spec_ctrl_state(void) - { - struct cpu_info *info = get_cpu_info(); diff --git a/system/xen/xsa/xsa320-4.13-3.patch b/system/xen/xsa/xsa320-4.13-3.patch deleted file mode 100644 index b52eeb338a..0000000000 --- a/system/xen/xsa/xsa320-4.13-3.patch +++ /dev/null @@ -1,36 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/spec-ctrl: Update docs with SRBDS workaround - -RDRAND/RDSEED can be hidden using cpuid= to mitigate SRBDS if microcode -isn't available. - -This is part of XSA-320 / CVE-2020-0543. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Julien Grall <jgrall@amazon.com> - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index c780312531..81e12d053c 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -481,12 +481,18 @@ choice of `dom0-kernel` is deprecated and not supported by all Dom0 kernels. - This option allows for fine tuning of the facilities Xen will use, after - accounting for hardware capabilities as enumerated via CPUID. - -+Unless otherwise noted, options only have any effect in their negative form, -+to hide the named feature(s). Ignoring a feature using this mechanism will -+cause Xen not to use the feature, nor offer them as usable to guests. -+ - Currently accepted: - - The Speculation Control hardware features `srbds-ctrl`, `md-clear`, `ibrsb`, - `stibp`, `ibpb`, `l1d-flush` and `ssbd` are used by default if available and --applicable. They can be ignored, e.g. `no-ibrsb`, at which point Xen won't --use them itself, and won't offer them to guests. -+applicable. They can all be ignored. -+ -+`rdrand` and `rdseed` can be ignored, as a mitigation to XSA-320 / -+CVE-2020-0543. - - ### cpuid_mask_cpu - > `= fam_0f_rev_[cdefg] | fam_10_rev_[bc] | fam_11_rev_b` diff --git a/system/xen/xsa/xsa327.patch b/system/xen/xsa/xsa327.patch deleted file mode 100644 index 0541cfa0df..0000000000 --- a/system/xen/xsa/xsa327.patch +++ /dev/null @@ -1,63 +0,0 @@ -From 030300ebbb86c40c12db038714479d746167c767 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 26 May 2020 18:31:33 +0100 -Subject: [PATCH] xen: Check the alignment of the offset pased via - VCPUOP_register_vcpu_info - -Currently a guest is able to register any guest physical address to use -for the vcpu_info structure as long as the structure can fits in the -rest of the frame. - -This means a guest can provide an address that is not aligned to the -natural alignment of the structure. - -On Arm 32-bit, unaligned access are completely forbidden by the -hypervisor. This will result to a data abort which is fatal. - -On Arm 64-bit, unaligned access are only forbidden when used for atomic -access. As the structure contains fields (such as evtchn_pending_self) -that are updated using atomic operations, any unaligned access will be -fatal as well. 
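The check xsa327 adds boils down to one alignment test. A self-contained sketch, with simplified stand-ins for the real native and compat vcpu_info layouts:

#include <errno.h>
#include <stddef.h>

/* Simplified stand-ins for the native and compat vcpu_info layouts. */
struct vcpu_info_native { unsigned long long evtchn_pending_sel; };
struct vcpu_info_compat { unsigned int evtchn_pending_sel; };

static int check_vcpu_info_offset(size_t offset, int has_32bit_shinfo)
{
    size_t align = has_32bit_shinfo ? _Alignof(struct vcpu_info_compat)
                                    : _Alignof(struct vcpu_info_native);

    /* A misaligned structure would make the atomic updates fault on Arm. */
    if (offset & (align - 1))
        return -EINVAL;

    return 0;
}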
- -While the misalignment is only fatal on Arm, a generic check is added -as an x86 guest shouldn't sensibly pass an unaligned address (this -would result to a split lock). - -This is XSA-327. - -Reported-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> ---- - xen/common/domain.c | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/xen/common/domain.c b/xen/common/domain.c -index 7cc9526139a6..e9be05f1d05f 100644 ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -1227,10 +1227,20 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset) - void *mapping; - vcpu_info_t *new_info; - struct page_info *page; -+ unsigned int align; - - if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) ) - return -EINVAL; - -+#ifdef CONFIG_COMPAT -+ if ( has_32bit_shinfo(d) ) -+ align = alignof(new_info->compat); -+ else -+#endif -+ align = alignof(*new_info); -+ if ( offset & (align - 1) ) -+ return -EINVAL; -+ - if ( !mfn_eq(v->vcpu_info_mfn, INVALID_MFN) ) - return -EINVAL; - --- -2.17.1 - diff --git a/system/xen/xsa/xsa328-4.13-1.patch b/system/xen/xsa/xsa328-4.13-1.patch deleted file mode 100644 index 56e48de3e9..0000000000 --- a/system/xen/xsa/xsa328-4.13-1.patch +++ /dev/null @@ -1,118 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: x86/EPT: ept_set_middle_entry() related adjustments - -ept_split_super_page() wants to further modify the newly allocated -table, so have ept_set_middle_entry() return the mapped pointer rather -than tearing it down and then getting re-established right again. - -Similarly ept_next_level() wants to hand back a mapped pointer of -the next level page, so re-use the one established by -ept_set_middle_entry() in case that path was taken. - -Pull the setting of suppress_ve ahead of insertion into the higher level -table, and don't have ept_split_super_page() set the field a 2nd time. - -This is part of XSA-328. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/arch/x86/mm/p2m-ept.c -+++ b/xen/arch/x86/mm/p2m-ept.c -@@ -187,8 +187,9 @@ static void ept_p2m_type_to_flags(struct - #define GUEST_TABLE_SUPER_PAGE 2 - #define GUEST_TABLE_POD_PAGE 3 - --/* Fill in middle levels of ept table */ --static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry) -+/* Fill in middle level of ept table; return pointer to mapped new table. 
*/ -+static ept_entry_t *ept_set_middle_entry(struct p2m_domain *p2m, -+ ept_entry_t *ept_entry) - { - mfn_t mfn; - ept_entry_t *table; -@@ -196,7 +197,12 @@ static int ept_set_middle_entry(struct p - - mfn = p2m_alloc_ptp(p2m, 0); - if ( mfn_eq(mfn, INVALID_MFN) ) -- return 0; -+ return NULL; -+ -+ table = map_domain_page(mfn); -+ -+ for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ ) -+ table[i].suppress_ve = 1; - - ept_entry->epte = 0; - ept_entry->mfn = mfn_x(mfn); -@@ -208,14 +214,7 @@ static int ept_set_middle_entry(struct p - - ept_entry->suppress_ve = 1; - -- table = map_domain_page(mfn); -- -- for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ ) -- table[i].suppress_ve = 1; -- -- unmap_domain_page(table); -- -- return 1; -+ return table; - } - - /* free ept sub tree behind an entry */ -@@ -253,10 +252,10 @@ static bool_t ept_split_super_page(struc - - ASSERT(is_epte_superpage(ept_entry)); - -- if ( !ept_set_middle_entry(p2m, &new_ept) ) -+ table = ept_set_middle_entry(p2m, &new_ept); -+ if ( !table ) - return 0; - -- table = map_domain_page(_mfn(new_ept.mfn)); - trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER); - - for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ ) -@@ -267,7 +266,6 @@ static bool_t ept_split_super_page(struc - epte->sp = (level > 1); - epte->mfn += i * trunk; - epte->snp = is_iommu_enabled(p2m->domain) && iommu_snoop; -- epte->suppress_ve = 1; - - ept_p2m_type_to_flags(p2m, epte, epte->sa_p2mt, epte->access); - -@@ -306,8 +304,7 @@ static int ept_next_level(struct p2m_dom - ept_entry_t **table, unsigned long *gfn_remainder, - int next_level) - { -- unsigned long mfn; -- ept_entry_t *ept_entry, e; -+ ept_entry_t *ept_entry, *next = NULL, e; - u32 shift, index; - - shift = next_level * EPT_TABLE_ORDER; -@@ -332,19 +329,17 @@ static int ept_next_level(struct p2m_dom - if ( read_only ) - return GUEST_TABLE_MAP_FAILED; - -- if ( !ept_set_middle_entry(p2m, ept_entry) ) -+ next = ept_set_middle_entry(p2m, ept_entry); -+ if ( !next ) - return GUEST_TABLE_MAP_FAILED; -- else -- e = atomic_read_ept_entry(ept_entry); /* Refresh */ -+ /* e is now stale and hence may not be used anymore below. */ - } -- - /* The only time sp would be set here is if we had hit a superpage */ -- if ( is_epte_superpage(&e) ) -+ else if ( is_epte_superpage(&e) ) - return GUEST_TABLE_SUPER_PAGE; - -- mfn = e.mfn; - unmap_domain_page(*table); -- *table = map_domain_page(_mfn(mfn)); -+ *table = next ?: map_domain_page(_mfn(e.mfn)); - *gfn_remainder &= (1UL << shift) - 1; - return GUEST_TABLE_NORMAL_PAGE; - } diff --git a/system/xen/xsa/xsa328-4.13-2.patch b/system/xen/xsa/xsa328-4.13-2.patch deleted file mode 100644 index c4f437f625..0000000000 --- a/system/xen/xsa/xsa328-4.13-2.patch +++ /dev/null @@ -1,48 +0,0 @@ -From: <security@xenproject.org> -Subject: x86/ept: atomically modify entries in ept_next_level - -ept_next_level was passing a live PTE pointer to ept_set_middle_entry, -which was then modified without taking into account that the PTE could -be part of a live EPT table. This wasn't a security issue because the -pages returned by p2m_alloc_ptp are zeroed, so adding such an entry -before actually initializing it didn't allow a guest to access -physical memory addresses it wasn't supposed to access. - -This is part of XSA-328. 
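The ordering that fix enforces can be shown in isolation: build the new entry in a local variable, then publish it with a single atomic store. A hedged sketch (the field layout here is illustrative, not the real EPT format):

#include <stdatomic.h>
#include <stdint.h>

static void publish_middle_entry(_Atomic uint64_t *live_pte, uint64_t table_mfn)
{
    uint64_t e = 0;

    e |= table_mfn << 12;   /* frame of the newly allocated, zeroed table */
    e |= 0x7;               /* read/write/execute permission bits */

    /* One atomic store: no observer can see a half-initialised entry. */
    atomic_store_explicit(live_pte, e, memory_order_release);
}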
- -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/arch/x86/mm/p2m-ept.c -+++ b/xen/arch/x86/mm/p2m-ept.c -@@ -307,6 +307,8 @@ static int ept_next_level(struct p2m_dom - ept_entry_t *ept_entry, *next = NULL, e; - u32 shift, index; - -+ ASSERT(next_level); -+ - shift = next_level * EPT_TABLE_ORDER; - - index = *gfn_remainder >> shift; -@@ -323,16 +325,20 @@ static int ept_next_level(struct p2m_dom - - if ( !is_epte_present(&e) ) - { -+ int rc; -+ - if ( e.sa_p2mt == p2m_populate_on_demand ) - return GUEST_TABLE_POD_PAGE; - - if ( read_only ) - return GUEST_TABLE_MAP_FAILED; - -- next = ept_set_middle_entry(p2m, ept_entry); -+ next = ept_set_middle_entry(p2m, &e); - if ( !next ) - return GUEST_TABLE_MAP_FAILED; -- /* e is now stale and hence may not be used anymore below. */ -+ -+ rc = atomic_write_ept_entry(p2m, ept_entry, e, next_level); -+ ASSERT(rc == 0); - } - /* The only time sp would be set here is if we had hit a superpage */ - else if ( is_epte_superpage(&e) ) diff --git a/system/xen/xsa/xsa328-post-xsa321-4.13-1.patch b/system/xen/xsa/xsa328-post-xsa321-4.13-1.patch deleted file mode 100644 index 9a08ab240e..0000000000 --- a/system/xen/xsa/xsa328-post-xsa321-4.13-1.patch +++ /dev/null @@ -1,31 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: vtd: improve IOMMU TLB flush - -Do not limit PSI flushes to order 0 pages, in order to avoid doing a -full TLB flush if the passed in page has an order greater than 0 and -is aligned. Should increase the performance of IOMMU TLB flushes when -dealing with page orders greater than 0. - -This is part of XSA-321. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -570,13 +570,14 @@ static int __must_check iommu_flush_iotl - if ( iommu_domid == -1 ) - continue; - -- if ( page_count != 1 || dfn_eq(dfn, INVALID_DFN) ) -+ if ( !page_count || (page_count & (page_count - 1)) || -+ dfn_eq(dfn, INVALID_DFN) || !IS_ALIGNED(dfn_x(dfn), page_count) ) - rc = iommu_flush_iotlb_dsi(iommu, iommu_domid, - 0, flush_dev_iotlb); - else - rc = iommu_flush_iotlb_psi(iommu, iommu_domid, - dfn_to_daddr(dfn), -- PAGE_ORDER_4K, -+ get_order_from_pages(page_count), - !dma_old_pte_present, - flush_dev_iotlb); - diff --git a/system/xen/xsa/xsa328-post-xsa321-4.13-2.patch b/system/xen/xsa/xsa328-post-xsa321-4.13-2.patch deleted file mode 100644 index 1e48615f2b..0000000000 --- a/system/xen/xsa/xsa328-post-xsa321-4.13-2.patch +++ /dev/null @@ -1,175 +0,0 @@ -From: <security@xenproject.org> -Subject: vtd: prune (and rename) cache flush functions - -Rename __iommu_flush_cache to iommu_sync_cache and remove -iommu_flush_cache_page. Also remove the iommu_flush_cache_entry -wrapper and just use iommu_sync_cache instead. Note the _entry suffix -was meaningless as the wrapper was already taking a size parameter in -bytes. While there also constify the addr parameter. - -No functional change intended. - -This is part of XSA-321. 
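The renamed function is, at its core, a cache-line flush loop. A hedged x86-only sketch using a GCC builtin (the real code reads the line size via CPUID; 64 bytes is assumed here), which still carries the alignment assumption that a later patch in this series corrects:

/* Flush each cache line covering [addr, addr + size). x86-only: the
 * clflush builtin requires SSE2. Line size assumed, not probed. */
static void sync_cache(const void *addr, unsigned int size)
{
    const unsigned int clflush_size = 64;

    for (unsigned int i = 0; i < size; i += clflush_size)
        __builtin_ia32_clflush((const char *)addr + i);
}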
- -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/drivers/passthrough/vtd/extern.h -+++ b/xen/drivers/passthrough/vtd/extern.h -@@ -43,8 +43,7 @@ void disable_qinval(struct vtd_iommu *io - int enable_intremap(struct vtd_iommu *iommu, int eim); - void disable_intremap(struct vtd_iommu *iommu); - --void iommu_flush_cache_entry(void *addr, unsigned int size); --void iommu_flush_cache_page(void *addr, unsigned long npages); -+void iommu_sync_cache(const void *addr, unsigned int size); - int iommu_alloc(struct acpi_drhd_unit *drhd); - void iommu_free(struct acpi_drhd_unit *drhd); - ---- a/xen/drivers/passthrough/vtd/intremap.c -+++ b/xen/drivers/passthrough/vtd/intremap.c -@@ -230,7 +230,7 @@ static void free_remap_entry(struct vtd_ - iremap_entries, iremap_entry); - - update_irte(iommu, iremap_entry, &new_ire, false); -- iommu_flush_cache_entry(iremap_entry, sizeof(*iremap_entry)); -+ iommu_sync_cache(iremap_entry, sizeof(*iremap_entry)); - iommu_flush_iec_index(iommu, 0, index); - - unmap_vtd_domain_page(iremap_entries); -@@ -406,7 +406,7 @@ static int ioapic_rte_to_remap_entry(str - } - - update_irte(iommu, iremap_entry, &new_ire, !init); -- iommu_flush_cache_entry(iremap_entry, sizeof(*iremap_entry)); -+ iommu_sync_cache(iremap_entry, sizeof(*iremap_entry)); - iommu_flush_iec_index(iommu, 0, index); - - unmap_vtd_domain_page(iremap_entries); -@@ -695,7 +695,7 @@ static int msi_msg_to_remap_entry( - update_irte(iommu, iremap_entry, &new_ire, msi_desc->irte_initialized); - msi_desc->irte_initialized = true; - -- iommu_flush_cache_entry(iremap_entry, sizeof(*iremap_entry)); -+ iommu_sync_cache(iremap_entry, sizeof(*iremap_entry)); - iommu_flush_iec_index(iommu, 0, index); - - unmap_vtd_domain_page(iremap_entries); ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -140,7 +140,8 @@ static int context_get_domain_id(struct - } - - static int iommus_incoherent; --static void __iommu_flush_cache(void *addr, unsigned int size) -+ -+void iommu_sync_cache(const void *addr, unsigned int size) - { - int i; - static unsigned int clflush_size = 0; -@@ -155,16 +156,6 @@ static void __iommu_flush_cache(void *ad - cacheline_flush((char *)addr + i); - } - --void iommu_flush_cache_entry(void *addr, unsigned int size) --{ -- __iommu_flush_cache(addr, size); --} -- --void iommu_flush_cache_page(void *addr, unsigned long npages) --{ -- __iommu_flush_cache(addr, PAGE_SIZE * npages); --} -- - /* Allocate page table, return its machine address */ - uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node) - { -@@ -183,7 +174,7 @@ uint64_t alloc_pgtable_maddr(unsigned lo - vaddr = __map_domain_page(cur_pg); - memset(vaddr, 0, PAGE_SIZE); - -- iommu_flush_cache_page(vaddr, 1); -+ iommu_sync_cache(vaddr, PAGE_SIZE); - unmap_domain_page(vaddr); - cur_pg++; - } -@@ -216,7 +207,7 @@ static u64 bus_to_context_maddr(struct v - } - set_root_value(*root, maddr); - set_root_present(*root); -- iommu_flush_cache_entry(root, sizeof(struct root_entry)); -+ iommu_sync_cache(root, sizeof(struct root_entry)); - } - maddr = (u64) get_context_addr(*root); - unmap_vtd_domain_page(root_entries); -@@ -263,7 +254,7 @@ static u64 addr_to_dma_page_maddr(struct - */ - dma_set_pte_readable(*pte); - dma_set_pte_writable(*pte); -- iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); -+ iommu_sync_cache(pte, sizeof(struct dma_pte)); - } - - if ( level == 2 ) -@@ -640,7 +631,7 @@ static int __must_check dma_pte_clear_on - *flush_flags |= IOMMU_FLUSHF_modified; - - 
spin_unlock(&hd->arch.mapping_lock); -- iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); -+ iommu_sync_cache(pte, sizeof(struct dma_pte)); - - unmap_vtd_domain_page(page); - -@@ -679,7 +670,7 @@ static void iommu_free_page_table(struct - iommu_free_pagetable(dma_pte_addr(*pte), next_level); - - dma_clear_pte(*pte); -- iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); -+ iommu_sync_cache(pte, sizeof(struct dma_pte)); - } - - unmap_vtd_domain_page(pt_vaddr); -@@ -1400,7 +1391,7 @@ int domain_context_mapping_one( - context_set_address_width(*context, agaw); - context_set_fault_enable(*context); - context_set_present(*context); -- iommu_flush_cache_entry(context, sizeof(struct context_entry)); -+ iommu_sync_cache(context, sizeof(struct context_entry)); - spin_unlock(&iommu->lock); - - /* Context entry was previously non-present (with domid 0). */ -@@ -1564,7 +1555,7 @@ int domain_context_unmap_one( - - context_clear_present(*context); - context_clear_entry(*context); -- iommu_flush_cache_entry(context, sizeof(struct context_entry)); -+ iommu_sync_cache(context, sizeof(struct context_entry)); - - iommu_domid= domain_iommu_domid(domain, iommu); - if ( iommu_domid == -1 ) -@@ -1791,7 +1782,7 @@ static int __must_check intel_iommu_map_ - - *pte = new; - -- iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); -+ iommu_sync_cache(pte, sizeof(struct dma_pte)); - spin_unlock(&hd->arch.mapping_lock); - unmap_vtd_domain_page(page); - -@@ -1866,7 +1857,7 @@ int iommu_pte_flush(struct domain *d, ui - int iommu_domid; - int rc = 0; - -- iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); -+ iommu_sync_cache(pte, sizeof(struct dma_pte)); - - for_each_drhd_unit ( drhd ) - { -@@ -2724,7 +2715,7 @@ static int __init intel_iommu_quarantine - dma_set_pte_addr(*pte, maddr); - dma_set_pte_readable(*pte); - } -- iommu_flush_cache_page(parent, 1); -+ iommu_sync_cache(parent, PAGE_SIZE); - - unmap_vtd_domain_page(parent); - parent = map_vtd_domain_page(maddr); diff --git a/system/xen/xsa/xsa328-post-xsa321-4.13-3.patch b/system/xen/xsa/xsa328-post-xsa321-4.13-3.patch deleted file mode 100644 index c141c4b785..0000000000 --- a/system/xen/xsa/xsa328-post-xsa321-4.13-3.patch +++ /dev/null @@ -1,82 +0,0 @@ -From: <security@xenproject.org> -Subject: x86/iommu: introduce a cache sync hook - -The hook is only implemented for VT-d and it uses the already existing -iommu_sync_cache function present in VT-d code. The new hook is -added so that the cache can be flushed by code outside of VT-d when -using shared page tables. - -Note that alloc_pgtable_maddr must use the now locally defined -sync_cache function, because IOMMU ops are not yet setup the first -time the function gets called during IOMMU initialization. - -No functional change intended. - -This is part of XSA-321. 
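The hook pattern itself is a plain optional callback in an ops table. A simplified, self-contained sketch (one callback instead of the full struct iommu_ops, and a stub driver implementation):

struct iommu_ops {
    void (*sync_cache)(const void *addr, unsigned int size);
    /* ... the other callbacks elided ... */
};

/* Driver implementation; left empty here as a stub. */
static void vtd_sync_cache(const void *addr, unsigned int size)
{
    (void)addr;
    (void)size;
}

static const struct iommu_ops vtd_ops = { .sync_cache = vtd_sync_cache };

static const struct iommu_ops *iommu_get_ops(void) { return &vtd_ops; }

static inline void iommu_sync_cache(const void *addr, unsigned int size)
{
    const struct iommu_ops *ops = iommu_get_ops();

    /* The hook is optional: coherent IOMMUs simply leave it unset. */
    if (ops->sync_cache)
        ops->sync_cache(addr, size);
}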
- -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/drivers/passthrough/vtd/extern.h -+++ b/xen/drivers/passthrough/vtd/extern.h -@@ -43,7 +43,6 @@ void disable_qinval(struct vtd_iommu *io - int enable_intremap(struct vtd_iommu *iommu, int eim); - void disable_intremap(struct vtd_iommu *iommu); - --void iommu_sync_cache(const void *addr, unsigned int size); - int iommu_alloc(struct acpi_drhd_unit *drhd); - void iommu_free(struct acpi_drhd_unit *drhd); - ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -141,7 +141,7 @@ static int context_get_domain_id(struct - - static int iommus_incoherent; - --void iommu_sync_cache(const void *addr, unsigned int size) -+static void sync_cache(const void *addr, unsigned int size) - { - int i; - static unsigned int clflush_size = 0; -@@ -174,7 +174,7 @@ uint64_t alloc_pgtable_maddr(unsigned lo - vaddr = __map_domain_page(cur_pg); - memset(vaddr, 0, PAGE_SIZE); - -- iommu_sync_cache(vaddr, PAGE_SIZE); -+ sync_cache(vaddr, PAGE_SIZE); - unmap_domain_page(vaddr); - cur_pg++; - } -@@ -2763,6 +2763,7 @@ const struct iommu_ops __initconstrel in - .iotlb_flush_all = iommu_flush_iotlb_all, - .get_reserved_device_memory = intel_iommu_get_reserved_device_memory, - .dump_p2m_table = vtd_dump_p2m_table, -+ .sync_cache = sync_cache, - }; - - const struct iommu_init_ops __initconstrel intel_iommu_init_ops = { ---- a/xen/include/asm-x86/iommu.h -+++ b/xen/include/asm-x86/iommu.h -@@ -121,6 +121,13 @@ extern bool untrusted_msi; - int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq, - const uint8_t gvec); - -+#define iommu_sync_cache(addr, size) ({ \ -+ const struct iommu_ops *ops = iommu_get_ops(); \ -+ \ -+ if ( ops->sync_cache ) \ -+ iommu_vcall(ops, sync_cache, addr, size); \ -+}) -+ - #endif /* !__ARCH_X86_IOMMU_H__ */ - /* - * Local variables: ---- a/xen/include/xen/iommu.h -+++ b/xen/include/xen/iommu.h -@@ -250,6 +250,7 @@ struct iommu_ops { - int (*setup_hpet_msi)(struct msi_desc *); - - int (*adjust_irq_affinities)(void); -+ void (*sync_cache)(const void *addr, unsigned int size); - #endif /* CONFIG_X86 */ - - int __must_check (*suspend)(void); diff --git a/system/xen/xsa/xsa328-post-xsa321-4.13-4.patch b/system/xen/xsa/xsa328-post-xsa321-4.13-4.patch deleted file mode 100644 index 62bbcc7271..0000000000 --- a/system/xen/xsa/xsa328-post-xsa321-4.13-4.patch +++ /dev/null @@ -1,36 +0,0 @@ -From: <security@xenproject.org> -Subject: vtd: don't assume addresses are aligned in sync_cache - -Current code in sync_cache assume that the address passed in is -aligned to a cache line size. Fix the code to support passing in -arbitrary addresses not necessarily aligned to a cache line size. - -This is part of XSA-321. 
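The corrected loop rounds the start address down to a line boundary and walks to the true end, so partially covered first and last lines are flushed as well. A hedged sketch with an assumed 64-byte line size:

#include <stdint.h>

static void cacheline_flush(const char *p)
{
    __builtin_ia32_clflush(p);   /* x86 / SSE2, as before */
}

static void sync_cache(const void *addr, unsigned int size)
{
    const uintptr_t clflush_size = 64;   /* assumed line size */
    const char *p = addr;
    const char *end = p + size;

    /* Round down so a partially covered first line is flushed too. */
    p = (const char *)((uintptr_t)p & ~(clflush_size - 1));

    for ( ; p < end; p += clflush_size)
        cacheline_flush(p);
}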
- -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -143,8 +143,8 @@ static int iommus_incoherent; - - static void sync_cache(const void *addr, unsigned int size) - { -- int i; -- static unsigned int clflush_size = 0; -+ static unsigned long clflush_size = 0; -+ const void *end = addr + size; - - if ( !iommus_incoherent ) - return; -@@ -152,8 +152,9 @@ static void sync_cache(const void *addr, - if ( clflush_size == 0 ) - clflush_size = get_cache_line_size(); - -- for ( i = 0; i < size; i += clflush_size ) -- cacheline_flush((char *)addr + i); -+ addr -= (unsigned long)addr & (clflush_size - 1); -+ for ( ; addr < end; addr += clflush_size ) -+ cacheline_flush((char *)addr); - } - - /* Allocate page table, return its machine address */ diff --git a/system/xen/xsa/xsa328-post-xsa321-4.13-5.patch b/system/xen/xsa/xsa328-post-xsa321-4.13-5.patch deleted file mode 100644 index 60cfe6ccdf..0000000000 --- a/system/xen/xsa/xsa328-post-xsa321-4.13-5.patch +++ /dev/null @@ -1,24 +0,0 @@ -From: <security@xenproject.org> -Subject: x86/alternative: introduce alternative_2 - -It's based on alternative_io_2 without inputs or outputs but with an -added memory clobber. - -This is part of XSA-321. - -Acked-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/include/asm-x86/alternative.h -+++ b/xen/include/asm-x86/alternative.h -@@ -114,6 +114,11 @@ extern void alternative_branches(void); - #define alternative(oldinstr, newinstr, feature) \ - asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") - -+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ -+ asm volatile (ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ -+ newinstr2, feature2) \ -+ : : : "memory") -+ - /* - * Alternative inline assembly with input. - * diff --git a/system/xen/xsa/xsa328-post-xsa321-4.13-6.patch b/system/xen/xsa/xsa328-post-xsa321-4.13-6.patch deleted file mode 100644 index 4c5c5ab0ba..0000000000 --- a/system/xen/xsa/xsa328-post-xsa321-4.13-6.patch +++ /dev/null @@ -1,91 +0,0 @@ -From: <security@xenproject.org> -Subject: vtd: optimize CPU cache sync - -Some VT-d IOMMUs are non-coherent, which requires a cache write back -in order for the changes made by the CPU to be visible to the IOMMU. -This cache write back was unconditionally done using clflush, but there are -other more efficient instructions to do so, hence implement support -for them using the alternative framework. - -This is part of XSA-321. - -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/drivers/passthrough/vtd/extern.h -+++ b/xen/drivers/passthrough/vtd/extern.h -@@ -68,7 +68,6 @@ int __must_check qinval_device_iotlb_syn - u16 did, u16 size, u64 addr); - - unsigned int get_cache_line_size(void); --void cacheline_flush(char *); - void flush_all_cache(void); - - uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node); ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -31,6 +31,7 @@ - #include <xen/pci_regs.h> - #include <xen/keyhandler.h> - #include <asm/msi.h> -+#include <asm/nops.h> - #include <asm/irq.h> - #include <asm/hvm/vmx/vmx.h> - #include <asm/p2m.h> -@@ -154,7 +155,42 @@ static void sync_cache(const void *addr, - - addr -= (unsigned long)addr & (clflush_size - 1); - for ( ; addr < end; addr += clflush_size ) -- cacheline_flush((char *)addr); -+/* -+ * The arguments to a macro must not include preprocessor directives. 
Doing so -+ * results in undefined behavior, so we have to create some defines here in -+ * order to avoid it. -+ */ -+#if defined(HAVE_AS_CLWB) -+# define CLWB_ENCODING "clwb %[p]" -+#elif defined(HAVE_AS_XSAVEOPT) -+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ -+#else -+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ -+#endif -+ -+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) -+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) -+# define INPUT BASE_INPUT -+#else -+# define INPUT(addr) "a" (addr), BASE_INPUT(addr) -+#endif -+ /* -+ * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush -+ * + prefix than a clflush + nop, and hence the prefix is added instead -+ * of letting the alternative framework fill the gap by appending nops. -+ */ -+ alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]", -+ "data16 clflush %[p]", /* clflushopt */ -+ X86_FEATURE_CLFLUSHOPT, -+ CLWB_ENCODING, -+ X86_FEATURE_CLWB, /* no outputs */, -+ INPUT(addr)); -+#undef INPUT -+#undef BASE_INPUT -+#undef CLWB_ENCODING -+ -+ alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT, -+ "sfence", X86_FEATURE_CLWB); - } - - /* Allocate page table, return its machine address */ ---- a/xen/drivers/passthrough/vtd/x86/vtd.c -+++ b/xen/drivers/passthrough/vtd/x86/vtd.c -@@ -51,11 +51,6 @@ unsigned int get_cache_line_size(void) - return ((cpuid_ebx(1) >> 8) & 0xff) * 8; - } - --void cacheline_flush(char * addr) --{ -- clflush(addr); --} -- - void flush_all_cache() - { - wbinvd(); diff --git a/system/xen/xsa/xsa328-post-xsa321-4.13-7.patch b/system/xen/xsa/xsa328-post-xsa321-4.13-7.patch deleted file mode 100644 index 0bd018f972..0000000000 --- a/system/xen/xsa/xsa328-post-xsa321-4.13-7.patch +++ /dev/null @@ -1,153 +0,0 @@ -From: <security@xenproject.org> -Subject: x86/ept: flush cache when modifying PTEs and sharing page tables - -Modifications made to the page tables by EPT code need to be written -to memory when the page tables are shared with the IOMMU, as Intel -IOMMUs can be non-coherent and thus require changes to be written to -memory in order to be visible to the IOMMU. - -In order to achieve this make sure data is written back to memory -after writing an EPT entry when the recalc bit is not set in -atomic_write_ept_entry. If such bit is set, the entry will be -adjusted and atomic_write_ept_entry will be called a second time -without the recalc bit set. Note that when splitting a super page the -new tables resulting of the split should also be written back. - -Failure to do so can allow devices behind the IOMMU access to the -stale super page, or cause coherency issues as changes made by the -processor to the page tables are not visible to the IOMMU. - -This allows to remove the VT-d specific iommu_pte_flush helper, since -the cache write back is now performed by atomic_write_ept_entry, and -hence iommu_iotlb_flush can be used to flush the IOMMU TLB. The newly -used method (iommu_iotlb_flush) can result in less flushes, since it -might sometimes be called rightly with 0 flags, in which case it -becomes a no-op. - -This is part of XSA-321. - -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/arch/x86/mm/p2m-ept.c -+++ b/xen/arch/x86/mm/p2m-ept.c -@@ -58,6 +58,19 @@ static int atomic_write_ept_entry(struct - - write_atomic(&entryptr->epte, new.epte); - -+ /* -+ * The recalc field on the EPT is used to signal either that a -+ * recalculation of the EMT field is required (which doesn't effect the -+ * IOMMU), or a type change. 
Type changes can only be between ram_rw, -+ * logdirty and ioreq_server: changes to/from logdirty won't work well with -+ * an IOMMU anyway, as IOMMU #PFs are not synchronous and will lead to -+ * aborts, and changes to/from ioreq_server are already fully flushed -+ * before returning to guest context (see -+ * XEN_DMOP_map_mem_type_to_ioreq_server). -+ */ -+ if ( !new.recalc && iommu_use_hap_pt(p2m->domain) ) -+ iommu_sync_cache(entryptr, sizeof(*entryptr)); -+ - return 0; - } - -@@ -278,6 +291,9 @@ static bool_t ept_split_super_page(struc - break; - } - -+ if ( iommu_use_hap_pt(p2m->domain) ) -+ iommu_sync_cache(table, EPT_PAGETABLE_ENTRIES * sizeof(ept_entry_t)); -+ - unmap_domain_page(table); - - /* Even failed we should install the newly allocated ept page. */ -@@ -337,6 +353,9 @@ static int ept_next_level(struct p2m_dom - if ( !next ) - return GUEST_TABLE_MAP_FAILED; - -+ if ( iommu_use_hap_pt(p2m->domain) ) -+ iommu_sync_cache(next, EPT_PAGETABLE_ENTRIES * sizeof(ept_entry_t)); -+ - rc = atomic_write_ept_entry(p2m, ept_entry, e, next_level); - ASSERT(rc == 0); - } -@@ -821,7 +840,10 @@ out: - need_modify_vtd_table ) - { - if ( iommu_use_hap_pt(d) ) -- rc = iommu_pte_flush(d, gfn, &ept_entry->epte, order, vtd_pte_present); -+ rc = iommu_iotlb_flush(d, _dfn(gfn), (1u << order), -+ (iommu_flags ? IOMMU_FLUSHF_added : 0) | -+ (vtd_pte_present ? IOMMU_FLUSHF_modified -+ : 0)); - else if ( need_iommu_pt_sync(d) ) - rc = iommu_flags ? - iommu_legacy_map(d, _dfn(gfn), mfn, order, iommu_flags) : ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -1884,53 +1884,6 @@ static int intel_iommu_lookup_page(struc - return 0; - } - --int iommu_pte_flush(struct domain *d, uint64_t dfn, uint64_t *pte, -- int order, int present) --{ -- struct acpi_drhd_unit *drhd; -- struct vtd_iommu *iommu = NULL; -- struct domain_iommu *hd = dom_iommu(d); -- bool_t flush_dev_iotlb; -- int iommu_domid; -- int rc = 0; -- -- iommu_sync_cache(pte, sizeof(struct dma_pte)); -- -- for_each_drhd_unit ( drhd ) -- { -- iommu = drhd->iommu; -- if ( !test_bit(iommu->index, &hd->arch.iommu_bitmap) ) -- continue; -- -- flush_dev_iotlb = !!find_ats_dev_drhd(iommu); -- iommu_domid= domain_iommu_domid(d, iommu); -- if ( iommu_domid == -1 ) -- continue; -- -- rc = iommu_flush_iotlb_psi(iommu, iommu_domid, -- __dfn_to_daddr(dfn), -- order, !present, flush_dev_iotlb); -- if ( rc > 0 ) -- { -- iommu_flush_write_buffer(iommu); -- rc = 0; -- } -- } -- -- if ( unlikely(rc) ) -- { -- if ( !d->is_shutting_down && printk_ratelimit() ) -- printk(XENLOG_ERR VTDPREFIX -- " d%d: IOMMU pages flush failed: %d\n", -- d->domain_id, rc); -- -- if ( !is_hardware_domain(d) ) -- domain_crash(d); -- } -- -- return rc; --} -- - static int __init vtd_ept_page_compatible(struct vtd_iommu *iommu) - { - u64 ept_cap, vtd_cap = iommu->cap; ---- a/xen/include/asm-x86/iommu.h -+++ b/xen/include/asm-x86/iommu.h -@@ -97,10 +97,6 @@ static inline int iommu_adjust_irq_affin - : 0; - } - --/* While VT-d specific, this must get declared in a generic header. 
*/ --int __must_check iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, -- int order, int present); -- - static inline bool iommu_supports_x2apic(void) - { - return iommu_init_ops && iommu_init_ops->supports_x2apic diff --git a/system/xen/xsa/xsa333.patch b/system/xen/xsa/xsa333.patch deleted file mode 100644 index 6b86c942fa..0000000000 --- a/system/xen/xsa/xsa333.patch +++ /dev/null @@ -1,39 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/pv: Handle the Intel-specific MSR_MISC_ENABLE correctly - -This MSR doesn't exist on AMD hardware, and switching away from the safe -functions in the common MSR path was an erroneous change. - -Partially revert the change. - -This is XSA-333. - -Fixes: 4fdc932b3cc ("x86/Intel: drop another 32-bit leftover") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Wei Liu <wl@xen.org> - -diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c -index efeb2a727e..6332c74b80 100644 ---- a/xen/arch/x86/pv/emul-priv-op.c -+++ b/xen/arch/x86/pv/emul-priv-op.c -@@ -924,7 +924,8 @@ static int read_msr(unsigned int reg, uint64_t *val, - return X86EMUL_OKAY; - - case MSR_IA32_MISC_ENABLE: -- rdmsrl(reg, *val); -+ if ( rdmsr_safe(reg, *val) ) -+ break; - *val = guest_misc_enable(*val); - return X86EMUL_OKAY; - -@@ -1059,7 +1060,8 @@ static int write_msr(unsigned int reg, uint64_t val, - break; - - case MSR_IA32_MISC_ENABLE: -- rdmsrl(reg, temp); -+ if ( rdmsr_safe(reg, temp) ) -+ break; - if ( val != guest_misc_enable(temp) ) - goto invalid; - return X86EMUL_OKAY; diff --git a/system/xen/xsa/xsa334.patch b/system/xen/xsa/xsa334.patch deleted file mode 100644 index 4260cdb2b2..0000000000 --- a/system/xen/xsa/xsa334.patch +++ /dev/null @@ -1,51 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: xen/memory: Don't skip the RCU unlock path in acquire_resource() - -In the case that an HVM Stubdomain makes an XENMEM_acquire_resource hypercall, -the FIXME path will bypass rcu_unlock_domain() on the way out of the function. - -Move the check to the start of the function. This does change the behaviour -of the get-size path for HVM Stubdomains, but that functionality is currently -broken and unused anyway, as well as being quite useless to entities which -can't actually map the resource anyway. - -This is XSA-334. - -Fixes: 83fa6552ce ("common: add a new mappable resource type: XENMEM_resource_grant_table") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/xen/common/memory.c b/xen/common/memory.c -index 1a3c9ffb30..29741d8904 100644 ---- a/xen/common/memory.c -+++ b/xen/common/memory.c -@@ -1058,6 +1058,14 @@ static int acquire_resource( - xen_pfn_t mfn_list[32]; - int rc; - -+ /* -+ * FIXME: Until foreign pages inserted into the P2M are properly -+ * reference counted, it is unsafe to allow mapping of -+ * resource pages unless the caller is the hardware domain. -+ */ -+ if ( paging_mode_translate(currd) && !is_hardware_domain(currd) ) -+ return -EACCES; -+ - if ( copy_from_guest(&xmar, arg, 1) ) - return -EFAULT; - -@@ -1114,14 +1122,6 @@ static int acquire_resource( - xen_pfn_t gfn_list[ARRAY_SIZE(mfn_list)]; - unsigned int i; - -- /* -- * FIXME: Until foreign pages inserted into the P2M are properly -- * reference counted, it is unsafe to allow mapping of -- * resource pages unless the caller is the hardware domain. 
-- */ -- if ( !is_hardware_domain(currd) ) -- return -EACCES; -- - if ( copy_from_guest(gfn_list, xmar.frame_list, xmar.nr_frames) ) - rc = -EFAULT; - diff --git a/system/xen/xsa/xsa335-qemu.patch b/system/xen/xsa/xsa335-qemu.patch deleted file mode 100644 index 5617502359..0000000000 --- a/system/xen/xsa/xsa335-qemu.patch +++ /dev/null @@ -1,84 +0,0 @@ -From c5bd2924c6d6a5bcbffb8b5e7798a88970131c07 Mon Sep 17 00:00:00 2001 -From: Gerd Hoffmann <kraxel@redhat.com> -Date: Mon, 17 Aug 2020 08:34:22 +0200 -Subject: [PATCH] usb: fix setup_len init (CVE-2020-14364) - -Store calculated setup_len in a local variable, verify it, and only -write it to the struct (USBDevice->setup_len) in case it passed the -sanity checks. - -This prevents other code (do_token_{in,out} functions specifically) -from working with invalid USBDevice->setup_len values and overrunning -the USBDevice->setup_buf[] buffer. - -Fixes: CVE-2020-14364 -Signed-off-by: Gerd Hoffmann <kraxel@redhat.com> ---- - hw/usb/core.c | 16 ++++++++++------ - 1 file changed, 10 insertions(+), 6 deletions(-) - -diff --git a/hw/usb/core.c b/hw/usb/core.c -index 5abd128b6bc5..5234dcc73fea 100644 ---- a/hw/usb/core.c -+++ b/hw/usb/core.c -@@ -129,6 +129,7 @@ void usb_wakeup(USBEndpoint *ep, unsigned int stream) - static void do_token_setup(USBDevice *s, USBPacket *p) - { - int request, value, index; -+ unsigned int setup_len; - - if (p->iov.size != 8) { - p->status = USB_RET_STALL; -@@ -138,14 +139,15 @@ static void do_token_setup(USBDevice *s, USBPacket *p) - usb_packet_copy(p, s->setup_buf, p->iov.size); - s->setup_index = 0; - p->actual_length = 0; -- s->setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; -- if (s->setup_len > sizeof(s->data_buf)) { -+ setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; -+ if (setup_len > sizeof(s->data_buf)) { - fprintf(stderr, - "usb_generic_handle_packet: ctrl buffer too small (%d > %zu)\n", -- s->setup_len, sizeof(s->data_buf)); -+ setup_len, sizeof(s->data_buf)); - p->status = USB_RET_STALL; - return; - } -+ s->setup_len = setup_len; - - request = (s->setup_buf[0] << 8) | s->setup_buf[1]; - value = (s->setup_buf[3] << 8) | s->setup_buf[2]; -@@ -259,26 +261,28 @@ static void do_token_out(USBDevice *s, USBPacket *p) - static void do_parameter(USBDevice *s, USBPacket *p) - { - int i, request, value, index; -+ unsigned int setup_len; - - for (i = 0; i < 8; i++) { - s->setup_buf[i] = p->parameter >> (i*8); - } - - s->setup_state = SETUP_STATE_PARAM; -- s->setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; - s->setup_index = 0; - - request = (s->setup_buf[0] << 8) | s->setup_buf[1]; - value = (s->setup_buf[3] << 8) | s->setup_buf[2]; - index = (s->setup_buf[5] << 8) | s->setup_buf[4]; - -- if (s->setup_len > sizeof(s->data_buf)) { -+ setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; -+ if (setup_len > sizeof(s->data_buf)) { - fprintf(stderr, - "usb_generic_handle_packet: ctrl buffer too small (%d > %zu)\n", -- s->setup_len, sizeof(s->data_buf)); -+ setup_len, sizeof(s->data_buf)); - p->status = USB_RET_STALL; - return; - } -+ s->setup_len = setup_len; - - if (p->pid == USB_TOKEN_OUT) { - usb_packet_copy(p, s->data_buf, s->setup_len); --- -2.18.4 diff --git a/system/xen/xsa/xsa336.patch b/system/xen/xsa/xsa336.patch deleted file mode 100644 index b44c298b70..0000000000 --- a/system/xen/xsa/xsa336.patch +++ /dev/null @@ -1,283 +0,0 @@ -From: Roger Pau Monné <roger.pau@citrix.com> -Subject: x86/vpt: fix race when migrating timers between vCPUs - -The current vPT code will migrate the emulated timers 
between vCPUs -(change the pt->vcpu field) while just holding the destination lock, -either from create_periodic_time or pt_adjust_global_vcpu_target if -the global target is adjusted. Changing the periodic_timer vCPU field -in this way creates a race where a third party could grab the lock in -the unlocked region of pt_adjust_global_vcpu_target (or before -create_periodic_time performs the vcpu change) and then release the -lock from a different vCPU, creating a locking imbalance. - -Introduce a per-domain rwlock in order to protect periodic_time -migration between vCPU lists. Taking the lock in read mode prevents -any timer from being migrated to a different vCPU, while taking it in -write mode allows performing migration of timers across vCPUs. The -per-vcpu locks are still used to protect all the other fields from the -periodic_timer struct. - -Note that such migration shouldn't happen frequently, and hence -there's no performance drop as a result of such locking. - -This is XSA-336. - -Reported-by: Igor Druzhinin <igor.druzhinin@citrix.com> -Tested-by: Igor Druzhinin <igor.druzhinin@citrix.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- -Changes since v2: - - Re-order pt_adjust_vcpu to remove one if. - - Fix pt_lock to not call pt_vcpu_lock, as we might end up using a - stale value of pt->vcpu when taking the per-vcpu lock. - -Changes since v1: - - Use a per-domain rwlock to protect timer vCPU migration. - ---- a/xen/arch/x86/hvm/hvm.c -+++ b/xen/arch/x86/hvm/hvm.c -@@ -658,6 +658,8 @@ int hvm_domain_initialise(struct domain - /* need link to containing domain */ - d->arch.hvm.pl_time->domain = d; - -+ rwlock_init(&d->arch.hvm.pl_time->pt_migrate); -+ - /* Set the default IO Bitmap. */ - if ( is_hardware_domain(d) ) - { ---- a/xen/arch/x86/hvm/vpt.c -+++ b/xen/arch/x86/hvm/vpt.c -@@ -153,23 +153,32 @@ static int pt_irq_masked(struct periodic - return 1; - } - --static void pt_lock(struct periodic_time *pt) -+static void pt_vcpu_lock(struct vcpu *v) - { -- struct vcpu *v; -+ read_lock(&v->domain->arch.hvm.pl_time->pt_migrate); -+ spin_lock(&v->arch.hvm.tm_lock); -+} - -- for ( ; ; ) -- { -- v = pt->vcpu; -- spin_lock(&v->arch.hvm.tm_lock); -- if ( likely(pt->vcpu == v) ) -- break; -- spin_unlock(&v->arch.hvm.tm_lock); -- } -+static void pt_vcpu_unlock(struct vcpu *v) -+{ -+ spin_unlock(&v->arch.hvm.tm_lock); -+ read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate); -+} -+ -+static void pt_lock(struct periodic_time *pt) -+{ -+ /* -+ * We cannot use pt_vcpu_lock here, because we need to acquire the -+ * per-domain lock first and then (re-)fetch the value of pt->vcpu, or -+ * else we might be using a stale value of pt->vcpu. 
-+ */ -+ read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); -+ spin_lock(&pt->vcpu->arch.hvm.tm_lock); - } - - static void pt_unlock(struct periodic_time *pt) - { -- spin_unlock(&pt->vcpu->arch.hvm.tm_lock); -+ pt_vcpu_unlock(pt->vcpu); - } - - static void pt_process_missed_ticks(struct periodic_time *pt) -@@ -219,7 +228,7 @@ void pt_save_timer(struct vcpu *v) - if ( v->pause_flags & VPF_blocked ) - return; - -- spin_lock(&v->arch.hvm.tm_lock); -+ pt_vcpu_lock(v); - - list_for_each_entry ( pt, head, list ) - if ( !pt->do_not_freeze ) -@@ -227,7 +236,7 @@ void pt_save_timer(struct vcpu *v) - - pt_freeze_time(v); - -- spin_unlock(&v->arch.hvm.tm_lock); -+ pt_vcpu_unlock(v); - } - - void pt_restore_timer(struct vcpu *v) -@@ -235,7 +244,7 @@ void pt_restore_timer(struct vcpu *v) - struct list_head *head = &v->arch.hvm.tm_list; - struct periodic_time *pt; - -- spin_lock(&v->arch.hvm.tm_lock); -+ pt_vcpu_lock(v); - - list_for_each_entry ( pt, head, list ) - { -@@ -248,7 +257,7 @@ void pt_restore_timer(struct vcpu *v) - - pt_thaw_time(v); - -- spin_unlock(&v->arch.hvm.tm_lock); -+ pt_vcpu_unlock(v); - } - - static void pt_timer_fn(void *data) -@@ -309,7 +318,7 @@ int pt_update_irq(struct vcpu *v) - int irq, pt_vector = -1; - bool level; - -- spin_lock(&v->arch.hvm.tm_lock); -+ pt_vcpu_lock(v); - - earliest_pt = NULL; - max_lag = -1ULL; -@@ -339,7 +348,7 @@ int pt_update_irq(struct vcpu *v) - - if ( earliest_pt == NULL ) - { -- spin_unlock(&v->arch.hvm.tm_lock); -+ pt_vcpu_unlock(v); - return -1; - } - -@@ -347,7 +356,7 @@ int pt_update_irq(struct vcpu *v) - irq = earliest_pt->irq; - level = earliest_pt->level; - -- spin_unlock(&v->arch.hvm.tm_lock); -+ pt_vcpu_unlock(v); - - switch ( earliest_pt->source ) - { -@@ -394,7 +403,7 @@ int pt_update_irq(struct vcpu *v) - time_cb *cb = NULL; - void *cb_priv; - -- spin_lock(&v->arch.hvm.tm_lock); -+ pt_vcpu_lock(v); - /* Make sure the timer is still on the list. 
*/ - list_for_each_entry ( pt, &v->arch.hvm.tm_list, list ) - if ( pt == earliest_pt ) -@@ -404,7 +413,7 @@ int pt_update_irq(struct vcpu *v) - cb_priv = pt->priv; - break; - } -- spin_unlock(&v->arch.hvm.tm_lock); -+ pt_vcpu_unlock(v); - - if ( cb != NULL ) - cb(v, cb_priv); -@@ -441,12 +450,12 @@ void pt_intr_post(struct vcpu *v, struct - if ( intack.source == hvm_intsrc_vector ) - return; - -- spin_lock(&v->arch.hvm.tm_lock); -+ pt_vcpu_lock(v); - - pt = is_pt_irq(v, intack); - if ( pt == NULL ) - { -- spin_unlock(&v->arch.hvm.tm_lock); -+ pt_vcpu_unlock(v); - return; - } - -@@ -455,7 +464,7 @@ void pt_intr_post(struct vcpu *v, struct - cb = pt->cb; - cb_priv = pt->priv; - -- spin_unlock(&v->arch.hvm.tm_lock); -+ pt_vcpu_unlock(v); - - if ( cb != NULL ) - cb(v, cb_priv); -@@ -466,12 +475,12 @@ void pt_migrate(struct vcpu *v) - struct list_head *head = &v->arch.hvm.tm_list; - struct periodic_time *pt; - -- spin_lock(&v->arch.hvm.tm_lock); -+ pt_vcpu_lock(v); - - list_for_each_entry ( pt, head, list ) - migrate_timer(&pt->timer, v->processor); - -- spin_unlock(&v->arch.hvm.tm_lock); -+ pt_vcpu_unlock(v); - } - - void create_periodic_time( -@@ -490,7 +499,7 @@ void create_periodic_time( - - destroy_periodic_time(pt); - -- spin_lock(&v->arch.hvm.tm_lock); -+ write_lock(&v->domain->arch.hvm.pl_time->pt_migrate); - - pt->pending_intr_nr = 0; - pt->do_not_freeze = 0; -@@ -540,7 +549,7 @@ void create_periodic_time( - init_timer(&pt->timer, pt_timer_fn, pt, v->processor); - set_timer(&pt->timer, pt->scheduled); - -- spin_unlock(&v->arch.hvm.tm_lock); -+ write_unlock(&v->domain->arch.hvm.pl_time->pt_migrate); - } - - void destroy_periodic_time(struct periodic_time *pt) -@@ -565,30 +574,20 @@ void destroy_periodic_time(struct period - - static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v) - { -- int on_list; -- - ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic); - - if ( pt->vcpu == NULL ) - return; - -- pt_lock(pt); -- on_list = pt->on_list; -- if ( pt->on_list ) -- list_del(&pt->list); -- pt->on_list = 0; -- pt_unlock(pt); -- -- spin_lock(&v->arch.hvm.tm_lock); -+ write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); - pt->vcpu = v; -- if ( on_list ) -+ if ( pt->on_list ) - { -- pt->on_list = 1; -+ list_del(&pt->list); - list_add(&pt->list, &v->arch.hvm.tm_list); -- - migrate_timer(&pt->timer, v->processor); - } -- spin_unlock(&v->arch.hvm.tm_lock); -+ write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); - } - - void pt_adjust_global_vcpu_target(struct vcpu *v) ---- a/xen/include/asm-x86/hvm/vpt.h -+++ b/xen/include/asm-x86/hvm/vpt.h -@@ -128,6 +128,13 @@ struct pl_time { /* platform time */ - struct RTCState vrtc; - struct HPETState vhpet; - struct PMTState vpmt; -+ /* -+ * rwlock to prevent periodic_time vCPU migration. Take the lock in read -+ * mode in order to prevent the vcpu field of periodic_time from changing. -+ * Lock must be taken in write mode when changes to the vcpu field are -+ * performed, as it allows exclusive access to all the timers of a domain. -+ */ -+ rwlock_t pt_migrate; - /* guest_time = Xen sys time + stime_offset */ - int64_t stime_offset; - /* Ensures monotonicity in appropriate timer modes. 
*/ diff --git a/system/xen/xsa/xsa337-4.13-1.patch b/system/xen/xsa/xsa337-4.13-1.patch deleted file mode 100644 index 2091626f4f..0000000000 --- a/system/xen/xsa/xsa337-4.13-1.patch +++ /dev/null @@ -1,87 +0,0 @@ -From: Roger Pau Monné <roger.pau@citrix.com> -Subject: x86/msi: get rid of read_msi_msg - -It's safer and faster to just use the cached last written -(untranslated) MSI message stored in msi_desc for the single user that -calls read_msi_msg. - -This also prevents relying on the data read from the device MSI -registers in order to figure out the index into the IOMMU interrupt -remapping table, which is not safe. - -This is part of XSA-337. - -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Requested-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - ---- a/xen/arch/x86/msi.c -+++ b/xen/arch/x86/msi.c -@@ -183,54 +183,6 @@ void msi_compose_msg(unsigned vector, co - MSI_DATA_VECTOR(vector); - } - --static bool read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) --{ -- switch ( entry->msi_attrib.type ) -- { -- case PCI_CAP_ID_MSI: -- { -- struct pci_dev *dev = entry->dev; -- int pos = entry->msi_attrib.pos; -- uint16_t data; -- -- msg->address_lo = pci_conf_read32(dev->sbdf, -- msi_lower_address_reg(pos)); -- if ( entry->msi_attrib.is_64 ) -- { -- msg->address_hi = pci_conf_read32(dev->sbdf, -- msi_upper_address_reg(pos)); -- data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 1)); -- } -- else -- { -- msg->address_hi = 0; -- data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 0)); -- } -- msg->data = data; -- break; -- } -- case PCI_CAP_ID_MSIX: -- { -- void __iomem *base = entry->mask_base; -- -- if ( unlikely(!msix_memory_decoded(entry->dev, -- entry->msi_attrib.pos)) ) -- return false; -- msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); -- msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); -- msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); -- break; -- } -- default: -- BUG(); -- } -- -- if ( iommu_intremap ) -- iommu_read_msi_from_ire(entry, msg); -- -- return true; --} -- - static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) - { - entry->msg = *msg; -@@ -302,10 +254,7 @@ void set_msi_affinity(struct irq_desc *d - - ASSERT(spin_is_locked(&desc->lock)); - -- memset(&msg, 0, sizeof(msg)); -- if ( !read_msi_msg(msi_desc, &msg) ) -- return; -- -+ msg = msi_desc->msg; - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(desc->arch.vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; diff --git a/system/xen/xsa/xsa337-4.13-2.patch b/system/xen/xsa/xsa337-4.13-2.patch deleted file mode 100644 index bdefd37cdc..0000000000 --- a/system/xen/xsa/xsa337-4.13-2.patch +++ /dev/null @@ -1,181 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: x86/MSI-X: restrict reading of table/PBA bases from BARs - -When assigned to less trusted or un-trusted guests, devices may change -state behind our backs (they may e.g. get reset by means we may not know -about). Therefore we should avoid reading BARs from hardware once a -device is no longer owned by Dom0. Furthermore when we can't read a BAR, -or when we read zero, we shouldn't instead use the caller provided -address unless that caller can be trusted. 
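(Aside: the companion patch above, xsa337-4.13-1, is the half that makes this safe to rely on — set_msi_affinity() now derives the new message from msi_desc->msg, the cached copy of the last value Xen itself wrote, instead of reading the device back. A rough standalone sketch of that "trust the cached copy, not the hardware" idiom, in plain C with hypothetical names — illustrative only, not the Xen code:

#include <stdint.h>

struct msi_msg {
    uint32_t address_lo;
    uint32_t address_hi;
    uint32_t data;
};

struct msi_state {
    struct msi_msg last_written;   /* authoritative cached copy */
};

/* Stand-in for the MMIO/config-space write that programs the device. */
static void hw_write_msg(const struct msi_msg *msg) { (void)msg; }

static void write_msg(struct msi_state *s, const struct msi_msg *msg)
{
    s->last_written = *msg;        /* cache first ... */
    hw_write_msg(msg);             /* ... then program the hardware */
}

/* Retarget the vector using only state we wrote ourselves; a guest
 * resetting the device cannot feed us bogus address/data this way. */
static void set_vector(struct msi_state *s, uint8_t vector)
{
    struct msi_msg msg = s->last_written;

    msg.data = (msg.data & ~0xffu) | vector;  /* x86 MSI: vector in data[7:0] */
    write_msg(s, &msg);
}

End of aside.)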
- -Re-arrange the logic in msix_capability_init() such that only Dom0 (and -only if the device isn't DomU-owned yet) or calls through -PHYSDEVOP_prepare_msix will actually result in the reading of the -respective BAR register(s). Additionally do so only as long as in-use -table entries are known (note that invocation of PHYSDEVOP_prepare_msix -counts as a "pseudo" entry). In all other uses the value already -recorded will get used instead. - -Clear the recorded values in _pci_cleanup_msix() as well as on the one -affected error path. (Adjust this error path to also avoid blindly -disabling MSI-X when it was enabled on entry to the function.) - -While moving around variable declarations (in many cases to reduce their -scopes), also adjust some of their types. - -This is part of XSA-337. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> - ---- a/xen/arch/x86/msi.c -+++ b/xen/arch/x86/msi.c -@@ -769,16 +769,14 @@ static int msix_capability_init(struct p - { - struct arch_msix *msix = dev->msix; - struct msi_desc *entry = NULL; -- int vf; - u16 control; - u64 table_paddr; - u32 table_offset; -- u8 bir, pbus, pslot, pfunc; - u16 seg = dev->seg; - u8 bus = dev->bus; - u8 slot = PCI_SLOT(dev->devfn); - u8 func = PCI_FUNC(dev->devfn); -- bool maskall = msix->host_maskall; -+ bool maskall = msix->host_maskall, zap_on_error = false; - unsigned int pos = pci_find_cap_offset(seg, bus, slot, func, - PCI_CAP_ID_MSIX); - -@@ -820,43 +818,45 @@ static int msix_capability_init(struct p - - /* Locate MSI-X table region */ - table_offset = pci_conf_read32(dev->sbdf, msix_table_offset_reg(pos)); -- bir = (u8)(table_offset & PCI_MSIX_BIRMASK); -- table_offset &= ~PCI_MSIX_BIRMASK; -+ if ( !msix->used_entries && -+ (!msi || -+ (is_hardware_domain(current->domain) && -+ (dev->domain == current->domain || dev->domain == dom_io))) ) -+ { -+ unsigned int bir = table_offset & PCI_MSIX_BIRMASK, pbus, pslot, pfunc; -+ int vf; -+ paddr_t pba_paddr; -+ unsigned int pba_offset; - -- if ( !dev->info.is_virtfn ) -- { -- pbus = bus; -- pslot = slot; -- pfunc = func; -- vf = -1; -- } -- else -- { -- pbus = dev->info.physfn.bus; -- pslot = PCI_SLOT(dev->info.physfn.devfn); -- pfunc = PCI_FUNC(dev->info.physfn.devfn); -- vf = PCI_BDF2(dev->bus, dev->devfn); -- } -- -- table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); -- WARN_ON(msi && msi->table_base != table_paddr); -- if ( !table_paddr ) -- { -- if ( !msi || !msi->table_base ) -+ if ( !dev->info.is_virtfn ) - { -- pci_conf_write16(dev->sbdf, msix_control_reg(pos), -- control & ~PCI_MSIX_FLAGS_ENABLE); -- xfree(entry); -- return -ENXIO; -+ pbus = bus; -+ pslot = slot; -+ pfunc = func; -+ vf = -1; -+ } -+ else -+ { -+ pbus = dev->info.physfn.bus; -+ pslot = PCI_SLOT(dev->info.physfn.devfn); -+ pfunc = PCI_FUNC(dev->info.physfn.devfn); -+ vf = PCI_BDF2(dev->bus, dev->devfn); - } -- table_paddr = msi->table_base; -- } -- table_paddr += table_offset; - -- if ( !msix->used_entries ) -- { -- u64 pba_paddr; -- u32 pba_offset; -+ table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); -+ WARN_ON(msi && msi->table_base != table_paddr); -+ if ( !table_paddr ) -+ { -+ if ( !msi || !msi->table_base ) -+ { -+ pci_conf_write16(dev->sbdf, msix_control_reg(pos), -+ control & ~PCI_MSIX_FLAGS_ENABLE); -+ xfree(entry); -+ return -ENXIO; -+ } -+ table_paddr = msi->table_base; -+ } -+ table_paddr += table_offset & ~PCI_MSIX_BIRMASK; - - msix->table.first = PFN_DOWN(table_paddr); - msix->table.last = 
PFN_DOWN(table_paddr + -@@ -875,7 +875,18 @@ static int msix_capability_init(struct p - BITS_TO_LONGS(msix->nr_entries) - 1); - WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first, - msix->pba.last)); -+ -+ zap_on_error = true; -+ } -+ else if ( !msix->table.first ) -+ { -+ pci_conf_write16(dev->sbdf, msix_control_reg(pos), control); -+ xfree(entry); -+ return -ENODATA; - } -+ else -+ table_paddr = (msix->table.first << PAGE_SHIFT) + -+ (table_offset & ~PCI_MSIX_BIRMASK & ~PAGE_MASK); - - if ( entry ) - { -@@ -886,8 +897,15 @@ static int msix_capability_init(struct p - - if ( idx < 0 ) - { -- pci_conf_write16(dev->sbdf, msix_control_reg(pos), -- control & ~PCI_MSIX_FLAGS_ENABLE); -+ if ( zap_on_error ) -+ { -+ msix->table.first = 0; -+ msix->pba.first = 0; -+ -+ control &= ~PCI_MSIX_FLAGS_ENABLE; -+ } -+ -+ pci_conf_write16(dev->sbdf, msix_control_reg(pos), control); - xfree(entry); - return idx; - } -@@ -1076,9 +1094,14 @@ static void _pci_cleanup_msix(struct arc - if ( rangeset_remove_range(mmio_ro_ranges, msix->table.first, - msix->table.last) ) - WARN(); -+ msix->table.first = 0; -+ msix->table.last = 0; -+ - if ( rangeset_remove_range(mmio_ro_ranges, msix->pba.first, - msix->pba.last) ) - WARN(); -+ msix->pba.first = 0; -+ msix->pba.last = 0; - } - } - diff --git a/system/xen/xsa/xsa338.patch b/system/xen/xsa/xsa338.patch deleted file mode 100644 index 776521990e..0000000000 --- a/system/xen/xsa/xsa338.patch +++ /dev/null @@ -1,42 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: evtchn: relax port_is_valid() - -To avoid ports potentially becoming invalid behind the back of certain -other functions (due to ->max_evtchn shrinking) because of -- a guest invoking evtchn_reset() and from a 2nd vCPU opening new - channels in parallel (see also XSA-343), -- alloc_unbound_xen_event_channel() produced channels living above the - 2-level range (see also XSA-342), -drop the max_evtchns check from port_is_valid(). For a port for which -the function once returned "true", the returned value may not turn into -"false" later on. The function's result may only depend on bounds which -can only ever grow (which is the case for d->valid_evtchns). - -This also eliminates a false sense of safety, utilized by some of the -users (see again XSA-343): Without a suitable lock held, d->max_evtchns -may change at any time, and hence deducing that certain other operations -are safe when port_is_valid() returned true is not legitimate. The -opportunities to abuse this may get widened by the change here -(depending on guest and host configuration), but will be taken care of -by the other XSA. - -This is XSA-338. - -Fixes: 48974e6ce52e ("evtchn: use a per-domain variable for the max number of event channels") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -Reviewed-by: Julien Grall <jgrall@amazon.com> ---- -v5: New, split from larger patch. 
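(Aside: the invariant this patch establishes — port_is_valid() may only ever go from false to true, because it now depends solely on the grow-only d->valid_evtchns — can be sketched in isolation. A toy version in standard C, with C11 atomics standing in for Xen's read_atomic()/write_atomic(); hypothetical names, not the Xen code:

#include <stdatomic.h>
#include <stdbool.h>

struct dom {
    _Atomic unsigned int valid_ports;  /* only ever grows */
};

/* Depends only on the grow-only counter, so once this returns true
 * for a given port it can never return false for it later. */
static bool port_is_valid(struct dom *d, unsigned int p)
{
    return p < atomic_load(&d->valid_ports);
}

/* Publishing another bucket of ports only raises the bound. */
static void publish_bucket(struct dom *d, unsigned int per_bucket)
{
    atomic_fetch_add(&d->valid_ports, per_bucket);
}

End of aside.)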
- ---- a/xen/include/xen/event.h -+++ b/xen/include/xen/event.h -@@ -107,8 +107,6 @@ void notify_via_xen_event_channel(struct - - static inline bool_t port_is_valid(struct domain *d, unsigned int p) - { -- if ( p >= d->max_evtchns ) -- return 0; - return p < read_atomic(&d->valid_evtchns); - } - diff --git a/system/xen/xsa/xsa339.patch b/system/xen/xsa/xsa339.patch deleted file mode 100644 index 3311ae093f..0000000000 --- a/system/xen/xsa/xsa339.patch +++ /dev/null @@ -1,76 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/pv: Avoid double exception injection - -There is at least one path (SYSENTER with NT set, Xen converts to #GP) which -ends up injecting the #GP fault twice, first in compat_sysenter(), and then a -second time in compat_test_all_events(), due to the stale TBF_EXCEPTION left -in TRAPBOUNCE_flags. - -The guest kernel sees the second fault first, which is a kernel level #GP -pointing at the head of the #GP handler, and is therefore a userspace -trigger-able DoS. - -This particular bug has bitten us several times before, so rearrange -{compat_,}create_bounce_frame() to clobber TRAPBOUNCE on success, rather than -leaving this task to one area of code which isn't used uniformly. - -Other scenarios which might result in a double injection (e.g. two calls -directly to compat_create_bounce_frame) will now crash the guest, which is far -more obvious than letting the kernel run with corrupt state. - -This is XSA-339 - -Fixes: fdac9515607b ("x86: clear EFLAGS.NT in SYSENTER entry path") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S -index c3e62f8734..73619f57ca 100644 ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -78,7 +78,6 @@ compat_process_softirqs: - sti - .Lcompat_bounce_exception: - call compat_create_bounce_frame -- movb $0, TRAPBOUNCE_flags(%rdx) - jmp compat_test_all_events - - ALIGN -@@ -352,7 +351,13 @@ __UNLIKELY_END(compat_bounce_null_selector) - movl %eax,UREGS_cs+8(%rsp) - movl TRAPBOUNCE_eip(%rdx),%eax - movl %eax,UREGS_rip+8(%rsp) -+ -+ /* Trapbounce complete. Clobber state to avoid an erroneous second injection. */ -+ xor %eax, %eax -+ mov %ax, TRAPBOUNCE_cs(%rdx) -+ mov %al, TRAPBOUNCE_flags(%rdx) - ret -+ - .section .fixup,"ax" - .Lfx13: - xorl %edi,%edi -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 1e880eb9f6..71a00e846b 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -90,7 +90,6 @@ process_softirqs: - sti - .Lbounce_exception: - call create_bounce_frame -- movb $0, TRAPBOUNCE_flags(%rdx) - jmp test_all_events - - ALIGN -@@ -512,6 +511,11 @@ UNLIKELY_START(z, create_bounce_frame_bad_bounce_ip) - jmp asm_domain_crash_synchronous /* Does not return */ - __UNLIKELY_END(create_bounce_frame_bad_bounce_ip) - movq %rax,UREGS_rip+8(%rsp) -+ -+ /* Trapbounce complete. Clobber state to avoid an erroneous second injection. 
*/ -+ xor %eax, %eax -+ mov %rax, TRAPBOUNCE_eip(%rdx) -+ mov %al, TRAPBOUNCE_flags(%rdx) - ret - - .pushsection .fixup, "ax", @progbits diff --git a/system/xen/xsa/xsa340.patch b/system/xen/xsa/xsa340.patch deleted file mode 100644 index 38d04da465..0000000000 --- a/system/xen/xsa/xsa340.patch +++ /dev/null @@ -1,65 +0,0 @@ -From: Julien Grall <jgrall@amazon.com> -Subject: xen/evtchn: Add missing barriers when accessing/allocating an event channel - -While the allocation of a bucket is always performed with the per-domain -lock, the bucket may be accessed without the lock taken (for instance, see -evtchn_send()). - -Instead such sites relies on port_is_valid() to return a non-zero value -when the port has a struct evtchn associated to it. The function will -mostly check whether the port is less than d->valid_evtchns as all the -buckets/event channels should be allocated up to that point. - -Unfortunately a compiler is free to re-order the assignment in -evtchn_allocate_port() so it would be possible to have d->valid_evtchns -updated before the new bucket has finish to allocate. - -Additionally on Arm, even if this was compiled "correctly", the -processor can still re-order the memory access. - -Add a write memory barrier in the allocation side and a read memory -barrier when the port is valid to prevent any re-ordering issue. - -This is XSA-340. - -Reported-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> - ---- a/xen/common/event_channel.c -+++ b/xen/common/event_channel.c -@@ -178,6 +178,13 @@ int evtchn_allocate_port(struct domain * - return -ENOMEM; - bucket_from_port(d, port) = chn; - -+ /* -+ * d->valid_evtchns is used to check whether the bucket can be -+ * accessed without the per-domain lock. Therefore, -+ * d->valid_evtchns should be seen *after* the new bucket has -+ * been setup. -+ */ -+ smp_wmb(); - write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); - } - ---- a/xen/include/xen/event.h -+++ b/xen/include/xen/event.h -@@ -107,7 +107,17 @@ void notify_via_xen_event_channel(struct - - static inline bool_t port_is_valid(struct domain *d, unsigned int p) - { -- return p < read_atomic(&d->valid_evtchns); -+ if ( p >= read_atomic(&d->valid_evtchns) ) -+ return false; -+ -+ /* -+ * The caller will usually access the event channel afterwards and -+ * may be done without taking the per-domain lock. The barrier is -+ * going in pair the smp_wmb() barrier in evtchn_allocate_port(). -+ */ -+ smp_rmb(); -+ -+ return true; - } - - static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p) diff --git a/system/xen/xsa/xsa342-4.13.patch b/system/xen/xsa/xsa342-4.13.patch deleted file mode 100644 index 334baf1b69..0000000000 --- a/system/xen/xsa/xsa342-4.13.patch +++ /dev/null @@ -1,145 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: evtchn/x86: enforce correct upper limit for 32-bit guests - -The recording of d->max_evtchns in evtchn_2l_init(), in particular with -the limited set of callers of the function, is insufficient. Neither for -PV nor for HVM guests the bitness is known at domain_create() time, yet -the upper bound in 2-level mode depends upon guest bitness. Recording -too high a limit "allows" x86 32-bit domains to open not properly usable -event channels, management of which (inside Xen) would then result in -corruption of the shared info and vCPU info structures. 
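(Aside: to make the bitness dependence concrete — in 2-level mode the ABI supports BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d) ports, i.e. 64 * 64 = 4096 bit positions for a 64-bit guest but only 32 * 32 = 1024 for a 32-bit one, so a limit latched before the guest's bitness is known can be too large. A trivial standalone illustration, not Xen code:

#include <stdio.h>

/* 2-level ABI: a word of selector bits, each selecting a word of
 * pending bits, giving word_bits * word_bits usable bit positions. */
static unsigned int max_2l_ports(unsigned int guest_word_bits)
{
    return guest_word_bits * guest_word_bits;
}

int main(void)
{
    printf("64-bit guest: %u\n", max_2l_ports(64));  /* 4096 */
    printf("32-bit guest: %u\n", max_2l_ports(32));  /* 1024 */
    return 0;
}

End of aside.)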
- -Keep the upper limit dynamic for the 2-level case, introducing a helper -function to retrieve the effective limit. This helper is now supposed to -be private to the event channel code. The used in do_poll() and -domain_dump_evtchn_info() weren't consistent with port uses elsewhere -and hence get switched to port_is_valid(). - -Furthermore FIFO mode's setup_ports() gets adjusted to loop only up to -the prior ABI limit, rather than all the way up to the new one. - -Finally a word on the change to do_poll(): Accessing ->max_evtchns -without holding a suitable lock was never safe, as it as well as -->evtchn_port_ops may change behind do_poll()'s back. Using -port_is_valid() instead widens some the window for potential abuse, -until we've dealt with the race altogether (see XSA-343). - -This is XSA-342. - -Reported-by: Julien Grall <jgrall@amazon.com> -Fixes: 48974e6ce52e ("evtchn: use a per-domain variable for the max number of event channels") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -Reviewed-by: Julien Grall <jgrall@amazon.com> - ---- a/xen/common/event_2l.c -+++ b/xen/common/event_2l.c -@@ -103,7 +103,6 @@ static const struct evtchn_port_ops evtc - void evtchn_2l_init(struct domain *d) - { - d->evtchn_port_ops = &evtchn_port_ops_2l; -- d->max_evtchns = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); - } - - /* ---- a/xen/common/event_channel.c -+++ b/xen/common/event_channel.c -@@ -151,7 +151,7 @@ static void free_evtchn_bucket(struct do - - int evtchn_allocate_port(struct domain *d, evtchn_port_t port) - { -- if ( port > d->max_evtchn_port || port >= d->max_evtchns ) -+ if ( port > d->max_evtchn_port || port >= max_evtchns(d) ) - return -ENOSPC; - - if ( port_is_valid(d, port) ) -@@ -1396,13 +1396,11 @@ static void domain_dump_evtchn_info(stru - - spin_lock(&d->event_lock); - -- for ( port = 1; port < d->max_evtchns; ++port ) -+ for ( port = 1; port_is_valid(d, port); ++port ) - { - const struct evtchn *chn; - char *ssid; - -- if ( !port_is_valid(d, port) ) -- continue; - chn = evtchn_from_port(d, port); - if ( chn->state == ECS_FREE ) - continue; ---- a/xen/common/event_fifo.c -+++ b/xen/common/event_fifo.c -@@ -478,7 +478,7 @@ static void cleanup_event_array(struct d - d->evtchn_fifo = NULL; - } - --static void setup_ports(struct domain *d) -+static void setup_ports(struct domain *d, unsigned int prev_evtchns) - { - unsigned int port; - -@@ -488,7 +488,7 @@ static void setup_ports(struct domain *d - * - save its pending state. - * - set default priority. - */ -- for ( port = 1; port < d->max_evtchns; port++ ) -+ for ( port = 1; port < prev_evtchns; port++ ) - { - struct evtchn *evtchn; - -@@ -546,6 +546,8 @@ int evtchn_fifo_init_control(struct evtc - if ( !d->evtchn_fifo ) - { - struct vcpu *vcb; -+ /* Latch the value before it changes during setup_event_array(). 
*/ -+ unsigned int prev_evtchns = max_evtchns(d); - - for_each_vcpu ( d, vcb ) { - rc = setup_control_block(vcb); -@@ -562,8 +564,7 @@ int evtchn_fifo_init_control(struct evtc - goto error; - - d->evtchn_port_ops = &evtchn_port_ops_fifo; -- d->max_evtchns = EVTCHN_FIFO_NR_CHANNELS; -- setup_ports(d); -+ setup_ports(d, prev_evtchns); - } - else - rc = map_control_block(v, gfn, offset); ---- a/xen/common/schedule.c -+++ b/xen/common/schedule.c -@@ -1434,7 +1434,7 @@ static long do_poll(struct sched_poll *s - goto out; - - rc = -EINVAL; -- if ( port >= d->max_evtchns ) -+ if ( !port_is_valid(d, port) ) - goto out; - - rc = 0; ---- a/xen/include/xen/event.h -+++ b/xen/include/xen/event.h -@@ -105,6 +105,12 @@ void notify_via_xen_event_channel(struct - #define bucket_from_port(d, p) \ - ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) - -+static inline unsigned int max_evtchns(const struct domain *d) -+{ -+ return d->evtchn_fifo ? EVTCHN_FIFO_NR_CHANNELS -+ : BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); -+} -+ - static inline bool_t port_is_valid(struct domain *d, unsigned int p) - { - if ( p >= read_atomic(&d->valid_evtchns) ) ---- a/xen/include/xen/sched.h -+++ b/xen/include/xen/sched.h -@@ -382,7 +382,6 @@ struct domain - /* Event channel information. */ - struct evtchn *evtchn; /* first bucket only */ - struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */ -- unsigned int max_evtchns; /* number supported by ABI */ - unsigned int max_evtchn_port; /* max permitted port number */ - unsigned int valid_evtchns; /* number of allocated event channels */ - spinlock_t event_lock; diff --git a/system/xen/xsa/xsa343-1.patch b/system/xen/xsa/xsa343-1.patch deleted file mode 100644 index 0abbc03e8d..0000000000 --- a/system/xen/xsa/xsa343-1.patch +++ /dev/null @@ -1,199 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: evtchn: evtchn_reset() shouldn't succeed with still-open ports - -While the function closes all ports, it does so without holding any -lock, and hence racing requests may be issued causing new ports to get -opened. This would have been problematic in particular if such a newly -opened port had a port number above the new implementation limit (i.e. -when switching from FIFO to 2-level) after the reset, as prior to -"evtchn: relax port_is_valid()" this could have led to e.g. -evtchn_close()'s "BUG_ON(!port_is_valid(d2, port2))" to trigger. - -Introduce a counter of active ports and check that it's (still) no -larger then the number of Xen internally used ones after obtaining the -necessary lock in evtchn_reset(). - -As to the access model of the new {active,xen}_evtchns fields - while -all writes get done using write_atomic(), reads ought to use -read_atomic() only when outside of a suitably locked region. - -Note that as of now evtchn_bind_virq() and evtchn_bind_ipi() don't have -a need to call check_free_port(). - -This is part of XSA-343. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -Reviewed-by: Julien Grall <jgrall@amazon.com> ---- -v7: Drop optimization from evtchn_reset(). -v6: Fix loop exit condition in evtchn_reset(). Use {read,write}_atomic() - also for xen_evtchns. -v5: Move increment in alloc_unbound_xen_event_channel() out of the inner - locked region. -v4: Account for Xen internal ports. -v3: Document intended access next to new struct field. -v2: Add comment to check_free_port(). Drop commented out calls. 
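(Aside: the counting scheme this patch introduces reduces to a small sketch — counters updated while holding the event lock, and a reset that refuses with -EAGAIN while guest-visible channels remain open. Illustrative standard C, with a pthread mutex and C11 atomics standing in for Xen's primitives; hypothetical names, not the Xen code:

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>

struct dom {
    pthread_mutex_t event_lock;
    _Atomic unsigned int active_evtchns; /* all in-use ports */
    _Atomic unsigned int xen_evtchns;    /* Xen-internal, survive reset */
};

static int try_reset(struct dom *d)
{
    int rc = 0;

    pthread_mutex_lock(&d->event_lock);
    /* Racing opens either completed (and were counted) or get refused. */
    if (atomic_load(&d->active_evtchns) > atomic_load(&d->xen_evtchns))
        rc = -EAGAIN;                    /* guest ports still open */
    /* else: safe to tear down / switch ABI here */
    pthread_mutex_unlock(&d->event_lock);

    return rc;
}

End of aside.)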
- ---- a/xen/common/event_channel.c -+++ b/xen/common/event_channel.c -@@ -188,6 +188,8 @@ int evtchn_allocate_port(struct domain * - write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); - } - -+ write_atomic(&d->active_evtchns, d->active_evtchns + 1); -+ - return 0; - } - -@@ -211,11 +213,26 @@ static int get_free_port(struct domain * - return -ENOSPC; - } - -+/* -+ * Check whether a port is still marked free, and if so update the domain -+ * counter accordingly. To be used on function exit paths. -+ */ -+static void check_free_port(struct domain *d, evtchn_port_t port) -+{ -+ if ( port_is_valid(d, port) && -+ evtchn_from_port(d, port)->state == ECS_FREE ) -+ write_atomic(&d->active_evtchns, d->active_evtchns - 1); -+} -+ - void evtchn_free(struct domain *d, struct evtchn *chn) - { - /* Clear pending event to avoid unexpected behavior on re-bind. */ - evtchn_port_clear_pending(d, chn); - -+ if ( consumer_is_xen(chn) ) -+ write_atomic(&d->xen_evtchns, d->xen_evtchns - 1); -+ write_atomic(&d->active_evtchns, d->active_evtchns - 1); -+ - /* Reset binding to vcpu0 when the channel is freed. */ - chn->state = ECS_FREE; - chn->notify_vcpu_id = 0; -@@ -258,6 +275,7 @@ static long evtchn_alloc_unbound(evtchn_ - alloc->port = port; - - out: -+ check_free_port(d, port); - spin_unlock(&d->event_lock); - rcu_unlock_domain(d); - -@@ -351,6 +369,7 @@ static long evtchn_bind_interdomain(evtc - bind->local_port = lport; - - out: -+ check_free_port(ld, lport); - spin_unlock(&ld->event_lock); - if ( ld != rd ) - spin_unlock(&rd->event_lock); -@@ -488,7 +507,7 @@ static long evtchn_bind_pirq(evtchn_bind - struct domain *d = current->domain; - struct vcpu *v = d->vcpu[0]; - struct pirq *info; -- int port, pirq = bind->pirq; -+ int port = 0, pirq = bind->pirq; - long rc; - - if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) -@@ -536,6 +555,7 @@ static long evtchn_bind_pirq(evtchn_bind - arch_evtchn_bind_pirq(d, pirq); - - out: -+ check_free_port(d, port); - spin_unlock(&d->event_lock); - - return rc; -@@ -1011,10 +1031,10 @@ int evtchn_unmask(unsigned int port) - return 0; - } - -- - int evtchn_reset(struct domain *d) - { - unsigned int i; -+ int rc = 0; - - if ( d != current->domain && !d->controller_pause_count ) - return -EINVAL; -@@ -1024,7 +1044,9 @@ int evtchn_reset(struct domain *d) - - spin_lock(&d->event_lock); - -- if ( d->evtchn_fifo ) -+ if ( d->active_evtchns > d->xen_evtchns ) -+ rc = -EAGAIN; -+ else if ( d->evtchn_fifo ) - { - /* Switching back to 2-level ABI. */ - evtchn_fifo_destroy(d); -@@ -1033,7 +1055,7 @@ int evtchn_reset(struct domain *d) - - spin_unlock(&d->event_lock); - -- return 0; -+ return rc; - } - - static long evtchn_set_priority(const struct evtchn_set_priority *set_priority) -@@ -1219,10 +1241,9 @@ int alloc_unbound_xen_event_channel( - - spin_lock(&ld->event_lock); - -- rc = get_free_port(ld); -+ port = rc = get_free_port(ld); - if ( rc < 0 ) - goto out; -- port = rc; - chn = evtchn_from_port(ld, port); - - rc = xsm_evtchn_unbound(XSM_TARGET, ld, chn, remote_domid); -@@ -1238,7 +1259,10 @@ int alloc_unbound_xen_event_channel( - - spin_unlock(&chn->lock); - -+ write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1); -+ - out: -+ check_free_port(ld, port); - spin_unlock(&ld->event_lock); - - return rc < 0 ? 
rc : port; -@@ -1314,6 +1338,7 @@ int evtchn_init(struct domain *d, unsign - return -EINVAL; - } - evtchn_from_port(d, 0)->state = ECS_RESERVED; -+ write_atomic(&d->active_evtchns, 0); - - #if MAX_VIRT_CPUS > BITS_PER_LONG - d->poll_mask = xzalloc_array(unsigned long, BITS_TO_LONGS(d->max_vcpus)); -@@ -1340,6 +1365,8 @@ void evtchn_destroy(struct domain *d) - for ( i = 0; port_is_valid(d, i); i++ ) - evtchn_close(d, i, 0); - -+ ASSERT(!d->active_evtchns); -+ - clear_global_virq_handlers(d); - - evtchn_fifo_destroy(d); ---- a/xen/include/xen/sched.h -+++ b/xen/include/xen/sched.h -@@ -361,6 +361,16 @@ struct domain - struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */ - unsigned int max_evtchn_port; /* max permitted port number */ - unsigned int valid_evtchns; /* number of allocated event channels */ -+ /* -+ * Number of in-use event channels. Writers should use write_atomic(). -+ * Readers need to use read_atomic() only when not holding event_lock. -+ */ -+ unsigned int active_evtchns; -+ /* -+ * Number of event channels used internally by Xen (not subject to -+ * EVTCHNOP_reset). Read/write access like for active_evtchns. -+ */ -+ unsigned int xen_evtchns; - spinlock_t event_lock; - const struct evtchn_port_ops *evtchn_port_ops; - struct evtchn_fifo_domain *evtchn_fifo; diff --git a/system/xen/xsa/xsa343-2.patch b/system/xen/xsa/xsa343-2.patch deleted file mode 100644 index b8eb4998f1..0000000000 --- a/system/xen/xsa/xsa343-2.patch +++ /dev/null @@ -1,295 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: evtchn: convert per-channel lock to be IRQ-safe - -... in order for send_guest_{global,vcpu}_virq() to be able to make use -of it. - -This is part of XSA-343. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> ---- -v6: New. ---- -TBD: This is the "dumb" conversion variant. In a couple of cases the - slightly simpler spin_{,un}lock_irq() could apparently be used. 
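(Aside: the double_evtchn_lock() change in this patch is an instance of the classic address-ordered double-lock idiom — always acquire the lower-addressed lock first, so no two paths can take a pair of channel locks in opposite orders. A standalone sketch with pthread mutexes standing in for the IRQ-safe spinlocks, illustrative only:

#include <pthread.h>

/* Acquire two locks in one global (address) order so concurrent
 * callers can never take them in opposite orders (ABBA deadlock).
 * Comparing unrelated pointers is formally unspecified in ISO C but
 * is the conventional kernel idiom. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if (a == b) {
        pthread_mutex_lock(a);           /* same channel: lock once */
    } else if (a < b) {
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
    } else {
        pthread_mutex_lock(b);
        pthread_mutex_lock(a);
    }
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
    pthread_mutex_unlock(a);
    if (a != b)
        pthread_mutex_unlock(b);
}

End of aside.)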
- ---- a/xen/common/event_channel.c -+++ b/xen/common/event_channel.c -@@ -248,6 +248,7 @@ static long evtchn_alloc_unbound(evtchn_ - int port; - domid_t dom = alloc->dom; - long rc; -+ unsigned long flags; - - d = rcu_lock_domain_by_any_id(dom); - if ( d == NULL ) -@@ -263,14 +264,14 @@ static long evtchn_alloc_unbound(evtchn_ - if ( rc ) - goto out; - -- spin_lock(&chn->lock); -+ spin_lock_irqsave(&chn->lock, flags); - - chn->state = ECS_UNBOUND; - if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF ) - chn->u.unbound.remote_domid = current->domain->domain_id; - evtchn_port_init(d, chn); - -- spin_unlock(&chn->lock); -+ spin_unlock_irqrestore(&chn->lock, flags); - - alloc->port = port; - -@@ -283,26 +284,32 @@ static long evtchn_alloc_unbound(evtchn_ - } - - --static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) -+static unsigned long double_evtchn_lock(struct evtchn *lchn, -+ struct evtchn *rchn) - { -- if ( lchn < rchn ) -+ unsigned long flags; -+ -+ if ( lchn <= rchn ) - { -- spin_lock(&lchn->lock); -- spin_lock(&rchn->lock); -+ spin_lock_irqsave(&lchn->lock, flags); -+ if ( lchn != rchn ) -+ spin_lock(&rchn->lock); - } - else - { -- if ( lchn != rchn ) -- spin_lock(&rchn->lock); -+ spin_lock_irqsave(&rchn->lock, flags); - spin_lock(&lchn->lock); - } -+ -+ return flags; - } - --static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn) -+static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn, -+ unsigned long flags) - { -- spin_unlock(&lchn->lock); - if ( lchn != rchn ) -- spin_unlock(&rchn->lock); -+ spin_unlock(&lchn->lock); -+ spin_unlock_irqrestore(&rchn->lock, flags); - } - - static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind) -@@ -312,6 +319,7 @@ static long evtchn_bind_interdomain(evtc - int lport, rport = bind->remote_port; - domid_t rdom = bind->remote_dom; - long rc; -+ unsigned long flags; - - if ( rdom == DOMID_SELF ) - rdom = current->domain->domain_id; -@@ -347,7 +355,7 @@ static long evtchn_bind_interdomain(evtc - if ( rc ) - goto out; - -- double_evtchn_lock(lchn, rchn); -+ flags = double_evtchn_lock(lchn, rchn); - - lchn->u.interdomain.remote_dom = rd; - lchn->u.interdomain.remote_port = rport; -@@ -364,7 +372,7 @@ static long evtchn_bind_interdomain(evtc - */ - evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn); - -- double_evtchn_unlock(lchn, rchn); -+ double_evtchn_unlock(lchn, rchn, flags); - - bind->local_port = lport; - -@@ -387,6 +395,7 @@ int evtchn_bind_virq(evtchn_bind_virq_t - struct domain *d = current->domain; - int virq = bind->virq, vcpu = bind->vcpu; - int rc = 0; -+ unsigned long flags; - - if ( (virq < 0) || (virq >= ARRAY_SIZE(v->virq_to_evtchn)) ) - return -EINVAL; -@@ -424,14 +433,14 @@ int evtchn_bind_virq(evtchn_bind_virq_t - - chn = evtchn_from_port(d, port); - -- spin_lock(&chn->lock); -+ spin_lock_irqsave(&chn->lock, flags); - - chn->state = ECS_VIRQ; - chn->notify_vcpu_id = vcpu; - chn->u.virq = virq; - evtchn_port_init(d, chn); - -- spin_unlock(&chn->lock); -+ spin_unlock_irqrestore(&chn->lock, flags); - - v->virq_to_evtchn[virq] = bind->port = port; - -@@ -448,6 +457,7 @@ static long evtchn_bind_ipi(evtchn_bind_ - struct domain *d = current->domain; - int port, vcpu = bind->vcpu; - long rc = 0; -+ unsigned long flags; - - if ( domain_vcpu(d, vcpu) == NULL ) - return -ENOENT; -@@ -459,13 +469,13 @@ static long evtchn_bind_ipi(evtchn_bind_ - - chn = evtchn_from_port(d, port); - -- spin_lock(&chn->lock); -+ spin_lock_irqsave(&chn->lock, flags); - - 
chn->state = ECS_IPI; - chn->notify_vcpu_id = vcpu; - evtchn_port_init(d, chn); - -- spin_unlock(&chn->lock); -+ spin_unlock_irqrestore(&chn->lock, flags); - - bind->port = port; - -@@ -509,6 +519,7 @@ static long evtchn_bind_pirq(evtchn_bind - struct pirq *info; - int port = 0, pirq = bind->pirq; - long rc; -+ unsigned long flags; - - if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) - return -EINVAL; -@@ -541,14 +552,14 @@ static long evtchn_bind_pirq(evtchn_bind - goto out; - } - -- spin_lock(&chn->lock); -+ spin_lock_irqsave(&chn->lock, flags); - - chn->state = ECS_PIRQ; - chn->u.pirq.irq = pirq; - link_pirq_port(port, chn, v); - evtchn_port_init(d, chn); - -- spin_unlock(&chn->lock); -+ spin_unlock_irqrestore(&chn->lock, flags); - - bind->port = port; - -@@ -569,6 +580,7 @@ int evtchn_close(struct domain *d1, int - struct evtchn *chn1, *chn2; - int port2; - long rc = 0; -+ unsigned long flags; - - again: - spin_lock(&d1->event_lock); -@@ -668,14 +680,14 @@ int evtchn_close(struct domain *d1, int - BUG_ON(chn2->state != ECS_INTERDOMAIN); - BUG_ON(chn2->u.interdomain.remote_dom != d1); - -- double_evtchn_lock(chn1, chn2); -+ flags = double_evtchn_lock(chn1, chn2); - - evtchn_free(d1, chn1); - - chn2->state = ECS_UNBOUND; - chn2->u.unbound.remote_domid = d1->domain_id; - -- double_evtchn_unlock(chn1, chn2); -+ double_evtchn_unlock(chn1, chn2, flags); - - goto out; - -@@ -683,9 +695,9 @@ int evtchn_close(struct domain *d1, int - BUG(); - } - -- spin_lock(&chn1->lock); -+ spin_lock_irqsave(&chn1->lock, flags); - evtchn_free(d1, chn1); -- spin_unlock(&chn1->lock); -+ spin_unlock_irqrestore(&chn1->lock, flags); - - out: - if ( d2 != NULL ) -@@ -705,13 +717,14 @@ int evtchn_send(struct domain *ld, unsig - struct evtchn *lchn, *rchn; - struct domain *rd; - int rport, ret = 0; -+ unsigned long flags; - - if ( !port_is_valid(ld, lport) ) - return -EINVAL; - - lchn = evtchn_from_port(ld, lport); - -- spin_lock(&lchn->lock); -+ spin_lock_irqsave(&lchn->lock, flags); - - /* Guest cannot send via a Xen-attached event channel. 
*/ - if ( unlikely(consumer_is_xen(lchn)) ) -@@ -746,7 +759,7 @@ int evtchn_send(struct domain *ld, unsig - } - - out: -- spin_unlock(&lchn->lock); -+ spin_unlock_irqrestore(&lchn->lock, flags); - - return ret; - } -@@ -1238,6 +1251,7 @@ int alloc_unbound_xen_event_channel( - { - struct evtchn *chn; - int port, rc; -+ unsigned long flags; - - spin_lock(&ld->event_lock); - -@@ -1250,14 +1264,14 @@ int alloc_unbound_xen_event_channel( - if ( rc ) - goto out; - -- spin_lock(&chn->lock); -+ spin_lock_irqsave(&chn->lock, flags); - - chn->state = ECS_UNBOUND; - chn->xen_consumer = get_xen_consumer(notification_fn); - chn->notify_vcpu_id = lvcpu; - chn->u.unbound.remote_domid = remote_domid; - -- spin_unlock(&chn->lock); -+ spin_unlock_irqrestore(&chn->lock, flags); - - write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1); - -@@ -1280,11 +1294,12 @@ void notify_via_xen_event_channel(struct - { - struct evtchn *lchn, *rchn; - struct domain *rd; -+ unsigned long flags; - - ASSERT(port_is_valid(ld, lport)); - lchn = evtchn_from_port(ld, lport); - -- spin_lock(&lchn->lock); -+ spin_lock_irqsave(&lchn->lock, flags); - - if ( likely(lchn->state == ECS_INTERDOMAIN) ) - { -@@ -1294,7 +1309,7 @@ void notify_via_xen_event_channel(struct - evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn); - } - -- spin_unlock(&lchn->lock); -+ spin_unlock_irqrestore(&lchn->lock, flags); - } - - void evtchn_check_pollers(struct domain *d, unsigned int port) diff --git a/system/xen/xsa/xsa343-3.patch b/system/xen/xsa/xsa343-3.patch deleted file mode 100644 index e513e308eb..0000000000 --- a/system/xen/xsa/xsa343-3.patch +++ /dev/null @@ -1,392 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: evtchn: address races with evtchn_reset() - -Neither d->evtchn_port_ops nor max_evtchns(d) may be used in an entirely -lock-less manner, as both may change by a racing evtchn_reset(). In the -common case, at least one of the domain's event lock or the per-channel -lock needs to be held. In the specific case of the inter-domain sending -by evtchn_send() and notify_via_xen_event_channel() holding the other -side's per-channel lock is sufficient, as the channel can't change state -without both per-channel locks held. Without such a channel changing -state, evtchn_reset() can't complete successfully. - -Lock-free accesses continue to be permitted for the shim (calling some -otherwise internal event channel functions), as this happens while the -domain is in effectively single-threaded mode. Special care also needs -taking for the shim's marking of in-use ports as ECS_RESERVED (allowing -use of such ports in the shim case is okay because switching into and -hence also out of FIFO mode is impossihble there). - -As a side effect, certain operations on Xen bound event channels which -were mistakenly permitted so far (e.g. unmask or poll) will be refused -now. - -This is part of XSA-343. - -Reported-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> ---- -v9: Add arch_evtchn_is_special() to fix PV shim. -v8: Add BUILD_BUG_ON() in evtchn_usable(). -v7: Add locking related comment ahead of struct evtchn_port_ops. -v6: New. ---- -TBD: I've been considering to move some of the wrappers from xen/event.h - into event_channel.c (or even drop them altogether), when they - require external locking (e.g. evtchn_port_init() or - evtchn_port_set_priority()). Does anyone have a strong opinion - either way? 
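(Aside: the core pattern of this patch — take a lock that freezes the channel's state, then re-check an evtchn_usable()-style predicate before acting — can be shown in toy form. Illustrative standard C with hypothetical names, not the Xen code:

#include <pthread.h>
#include <stdbool.h>

enum state { ECS_FREE, ECS_RESERVED, ECS_UNBOUND, ECS_BOUND };

struct channel {
    pthread_mutex_t lock;
    enum state state;
    bool xen_consumer;
};

/* Must be called with c->lock held: the lock freezes c->state, so a
 * racing reset/close cannot slip in between the check and the use. */
static bool usable(const struct channel *c)
{
    return !c->xen_consumer && c->state > ECS_RESERVED;
}

static void set_pending(struct channel *c)
{
    pthread_mutex_lock(&c->lock);
    if (usable(c)) {
        /* ... deliver the event ... */
    }
    pthread_mutex_unlock(&c->lock);
}

End of aside.)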
- ---- a/xen/arch/x86/irq.c -+++ b/xen/arch/x86/irq.c -@@ -2488,14 +2488,24 @@ static void dump_irqs(unsigned char key) - - for ( i = 0; i < action->nr_guests; ) - { -+ struct evtchn *evtchn; -+ unsigned int pending = 2, masked = 2; -+ - d = action->guest[i++]; - pirq = domain_irq_to_pirq(d, irq); - info = pirq_info(d, pirq); -+ evtchn = evtchn_from_port(d, info->evtchn); -+ local_irq_disable(); -+ if ( spin_trylock(&evtchn->lock) ) -+ { -+ pending = evtchn_is_pending(d, evtchn); -+ masked = evtchn_is_masked(d, evtchn); -+ spin_unlock(&evtchn->lock); -+ } -+ local_irq_enable(); - printk("d%d:%3d(%c%c%c)%c", -- d->domain_id, pirq, -- evtchn_port_is_pending(d, info->evtchn) ? 'P' : '-', -- evtchn_port_is_masked(d, info->evtchn) ? 'M' : '-', -- info->masked ? 'M' : '-', -+ d->domain_id, pirq, "-P?"[pending], -+ "-M?"[masked], info->masked ? 'M' : '-', - i < action->nr_guests ? ',' : '\n'); - } - } ---- a/xen/arch/x86/pv/shim.c -+++ b/xen/arch/x86/pv/shim.c -@@ -660,8 +660,11 @@ void pv_shim_inject_evtchn(unsigned int - if ( port_is_valid(guest, port) ) - { - struct evtchn *chn = evtchn_from_port(guest, port); -+ unsigned long flags; - -+ spin_lock_irqsave(&chn->lock, flags); - evtchn_port_set_pending(guest, chn->notify_vcpu_id, chn); -+ spin_unlock_irqrestore(&chn->lock, flags); - } - } - ---- a/xen/common/event_2l.c -+++ b/xen/common/event_2l.c -@@ -63,8 +63,10 @@ static void evtchn_2l_unmask(struct doma - } - } - --static bool evtchn_2l_is_pending(const struct domain *d, evtchn_port_t port) -+static bool evtchn_2l_is_pending(const struct domain *d, -+ const struct evtchn *evtchn) - { -+ evtchn_port_t port = evtchn->port; - unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); - - ASSERT(port < max_ports); -@@ -72,8 +74,10 @@ static bool evtchn_2l_is_pending(const s - guest_test_bit(d, port, &shared_info(d, evtchn_pending))); - } - --static bool evtchn_2l_is_masked(const struct domain *d, evtchn_port_t port) -+static bool evtchn_2l_is_masked(const struct domain *d, -+ const struct evtchn *evtchn) - { -+ evtchn_port_t port = evtchn->port; - unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); - - ASSERT(port < max_ports); ---- a/xen/common/event_channel.c -+++ b/xen/common/event_channel.c -@@ -156,8 +156,9 @@ int evtchn_allocate_port(struct domain * - - if ( port_is_valid(d, port) ) - { -- if ( evtchn_from_port(d, port)->state != ECS_FREE || -- evtchn_port_is_busy(d, port) ) -+ const struct evtchn *chn = evtchn_from_port(d, port); -+ -+ if ( chn->state != ECS_FREE || evtchn_is_busy(d, chn) ) - return -EBUSY; - } - else -@@ -774,6 +775,7 @@ void send_guest_vcpu_virq(struct vcpu *v - unsigned long flags; - int port; - struct domain *d; -+ struct evtchn *chn; - - ASSERT(!virq_is_global(virq)); - -@@ -784,7 +786,10 @@ void send_guest_vcpu_virq(struct vcpu *v - goto out; - - d = v->domain; -- evtchn_port_set_pending(d, v->vcpu_id, evtchn_from_port(d, port)); -+ chn = evtchn_from_port(d, port); -+ spin_lock(&chn->lock); -+ evtchn_port_set_pending(d, v->vcpu_id, chn); -+ spin_unlock(&chn->lock); - - out: - spin_unlock_irqrestore(&v->virq_lock, flags); -@@ -813,7 +818,9 @@ void send_guest_global_virq(struct domai - goto out; - - chn = evtchn_from_port(d, port); -+ spin_lock(&chn->lock); - evtchn_port_set_pending(d, chn->notify_vcpu_id, chn); -+ spin_unlock(&chn->lock); - - out: - spin_unlock_irqrestore(&v->virq_lock, flags); -@@ -823,6 +830,7 @@ void send_guest_pirq(struct domain *d, c - { - int port; - struct evtchn *chn; -+ unsigned long flags; - - /* - 
* PV guests: It should not be possible to race with __evtchn_close(). The -@@ -837,7 +845,9 @@ void send_guest_pirq(struct domain *d, c - } - - chn = evtchn_from_port(d, port); -+ spin_lock_irqsave(&chn->lock, flags); - evtchn_port_set_pending(d, chn->notify_vcpu_id, chn); -+ spin_unlock_irqrestore(&chn->lock, flags); - } - - static struct domain *global_virq_handlers[NR_VIRQS] __read_mostly; -@@ -1034,12 +1044,15 @@ int evtchn_unmask(unsigned int port) - { - struct domain *d = current->domain; - struct evtchn *evtchn; -+ unsigned long flags; - - if ( unlikely(!port_is_valid(d, port)) ) - return -EINVAL; - - evtchn = evtchn_from_port(d, port); -+ spin_lock_irqsave(&evtchn->lock, flags); - evtchn_port_unmask(d, evtchn); -+ spin_unlock_irqrestore(&evtchn->lock, flags); - - return 0; - } -@@ -1449,8 +1462,8 @@ static void domain_dump_evtchn_info(stru - - printk(" %4u [%d/%d/", - port, -- evtchn_port_is_pending(d, port), -- evtchn_port_is_masked(d, port)); -+ evtchn_is_pending(d, chn), -+ evtchn_is_masked(d, chn)); - evtchn_port_print_state(d, chn); - printk("]: s=%d n=%d x=%d", - chn->state, chn->notify_vcpu_id, chn->xen_consumer); ---- a/xen/common/event_fifo.c -+++ b/xen/common/event_fifo.c -@@ -296,23 +296,26 @@ static void evtchn_fifo_unmask(struct do - evtchn_fifo_set_pending(v, evtchn); - } - --static bool evtchn_fifo_is_pending(const struct domain *d, evtchn_port_t port) -+static bool evtchn_fifo_is_pending(const struct domain *d, -+ const struct evtchn *evtchn) - { -- const event_word_t *word = evtchn_fifo_word_from_port(d, port); -+ const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port); - - return word && guest_test_bit(d, EVTCHN_FIFO_PENDING, word); - } - --static bool_t evtchn_fifo_is_masked(const struct domain *d, evtchn_port_t port) -+static bool_t evtchn_fifo_is_masked(const struct domain *d, -+ const struct evtchn *evtchn) - { -- const event_word_t *word = evtchn_fifo_word_from_port(d, port); -+ const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port); - - return !word || guest_test_bit(d, EVTCHN_FIFO_MASKED, word); - } - --static bool_t evtchn_fifo_is_busy(const struct domain *d, evtchn_port_t port) -+static bool_t evtchn_fifo_is_busy(const struct domain *d, -+ const struct evtchn *evtchn) - { -- const event_word_t *word = evtchn_fifo_word_from_port(d, port); -+ const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port); - - return word && guest_test_bit(d, EVTCHN_FIFO_LINKED, word); - } ---- a/xen/include/asm-x86/event.h -+++ b/xen/include/asm-x86/event.h -@@ -47,4 +47,10 @@ static inline bool arch_virq_is_global(u - return true; - } - -+#ifdef CONFIG_PV_SHIM -+# include <asm/pv/shim.h> -+# define arch_evtchn_is_special(chn) \ -+ (pv_shim && (chn)->port && (chn)->state == ECS_RESERVED) -+#endif -+ - #endif ---- a/xen/include/xen/event.h -+++ b/xen/include/xen/event.h -@@ -133,6 +133,24 @@ static inline struct evtchn *evtchn_from - return bucket_from_port(d, p) + (p % EVTCHNS_PER_BUCKET); - } - -+/* -+ * "usable" as in "by a guest", i.e. Xen consumed channels are assumed to be -+ * taken care of separately where used for Xen's internal purposes. -+ */ -+static bool evtchn_usable(const struct evtchn *evtchn) -+{ -+ if ( evtchn->xen_consumer ) -+ return false; -+ -+#ifdef arch_evtchn_is_special -+ if ( arch_evtchn_is_special(evtchn) ) -+ return true; -+#endif -+ -+ BUILD_BUG_ON(ECS_FREE > ECS_RESERVED); -+ return evtchn->state > ECS_RESERVED; -+} -+ - /* Wait on a Xen-attached event channel. 
*/ - #define wait_on_xen_event_channel(port, condition) \ - do { \ -@@ -165,19 +183,24 @@ int evtchn_reset(struct domain *d); - - /* - * Low-level event channel port ops. -+ * -+ * All hooks have to be called with a lock held which prevents the channel -+ * from changing state. This may be the domain event lock, the per-channel -+ * lock, or in the case of sending interdomain events also the other side's -+ * per-channel lock. Exceptions apply in certain cases for the PV shim. - */ - struct evtchn_port_ops { - void (*init)(struct domain *d, struct evtchn *evtchn); - void (*set_pending)(struct vcpu *v, struct evtchn *evtchn); - void (*clear_pending)(struct domain *d, struct evtchn *evtchn); - void (*unmask)(struct domain *d, struct evtchn *evtchn); -- bool (*is_pending)(const struct domain *d, evtchn_port_t port); -- bool (*is_masked)(const struct domain *d, evtchn_port_t port); -+ bool (*is_pending)(const struct domain *d, const struct evtchn *evtchn); -+ bool (*is_masked)(const struct domain *d, const struct evtchn *evtchn); - /* - * Is the port unavailable because it's still being cleaned up - * after being closed? - */ -- bool (*is_busy)(const struct domain *d, evtchn_port_t port); -+ bool (*is_busy)(const struct domain *d, const struct evtchn *evtchn); - int (*set_priority)(struct domain *d, struct evtchn *evtchn, - unsigned int priority); - void (*print_state)(struct domain *d, const struct evtchn *evtchn); -@@ -193,38 +216,67 @@ static inline void evtchn_port_set_pendi - unsigned int vcpu_id, - struct evtchn *evtchn) - { -- d->evtchn_port_ops->set_pending(d->vcpu[vcpu_id], evtchn); -+ if ( evtchn_usable(evtchn) ) -+ d->evtchn_port_ops->set_pending(d->vcpu[vcpu_id], evtchn); - } - - static inline void evtchn_port_clear_pending(struct domain *d, - struct evtchn *evtchn) - { -- d->evtchn_port_ops->clear_pending(d, evtchn); -+ if ( evtchn_usable(evtchn) ) -+ d->evtchn_port_ops->clear_pending(d, evtchn); - } - - static inline void evtchn_port_unmask(struct domain *d, - struct evtchn *evtchn) - { -- d->evtchn_port_ops->unmask(d, evtchn); -+ if ( evtchn_usable(evtchn) ) -+ d->evtchn_port_ops->unmask(d, evtchn); - } - --static inline bool evtchn_port_is_pending(const struct domain *d, -- evtchn_port_t port) -+static inline bool evtchn_is_pending(const struct domain *d, -+ const struct evtchn *evtchn) - { -- return d->evtchn_port_ops->is_pending(d, port); -+ return evtchn_usable(evtchn) && d->evtchn_port_ops->is_pending(d, evtchn); - } - --static inline bool evtchn_port_is_masked(const struct domain *d, -- evtchn_port_t port) -+static inline bool evtchn_port_is_pending(struct domain *d, evtchn_port_t port) - { -- return d->evtchn_port_ops->is_masked(d, port); -+ struct evtchn *evtchn = evtchn_from_port(d, port); -+ bool rc; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&evtchn->lock, flags); -+ rc = evtchn_is_pending(d, evtchn); -+ spin_unlock_irqrestore(&evtchn->lock, flags); -+ -+ return rc; -+} -+ -+static inline bool evtchn_is_masked(const struct domain *d, -+ const struct evtchn *evtchn) -+{ -+ return !evtchn_usable(evtchn) || d->evtchn_port_ops->is_masked(d, evtchn); -+} -+ -+static inline bool evtchn_port_is_masked(struct domain *d, evtchn_port_t port) -+{ -+ struct evtchn *evtchn = evtchn_from_port(d, port); -+ bool rc; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&evtchn->lock, flags); -+ rc = evtchn_is_masked(d, evtchn); -+ spin_unlock_irqrestore(&evtchn->lock, flags); -+ -+ return rc; - } - --static inline bool evtchn_port_is_busy(const struct domain *d, -- evtchn_port_t port) 
-+static inline bool evtchn_is_busy(const struct domain *d, -+ const struct evtchn *evtchn) - { - return d->evtchn_port_ops->is_busy && -- d->evtchn_port_ops->is_busy(d, port); -+ d->evtchn_port_ops->is_busy(d, evtchn); - } - - static inline int evtchn_port_set_priority(struct domain *d, -@@ -233,6 +285,8 @@ static inline int evtchn_port_set_priori - { - if ( !d->evtchn_port_ops->set_priority ) - return -ENOSYS; -+ if ( !evtchn_usable(evtchn) ) -+ return -EACCES; - return d->evtchn_port_ops->set_priority(d, evtchn, priority); - } - diff --git a/system/xen/xsa/xsa344-4.13-1.patch b/system/xen/xsa/xsa344-4.13-1.patch deleted file mode 100644 index d8e9b3f43f..0000000000 --- a/system/xen/xsa/xsa344-4.13-1.patch +++ /dev/null @@ -1,130 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: evtchn: arrange for preemption in evtchn_destroy() - -Especially closing of fully established interdomain channels can take -quite some time, due to the locking involved. Therefore we shouldn't -assume we can clean up still active ports all in one go. Besides adding -the necessary preemption check, also avoid pointlessly starting from -(or now really ending at) 0; 1 is the lowest numbered port which may -need closing. - -Since we're now reducing ->valid_evtchns, free_xen_event_channel(), -and (at least to be on the safe side) notify_via_xen_event_channel() -need to cope with attempts to close / unbind from / send through already -closed (and no longer valid, as per port_is_valid()) ports. - -This is part of XSA-344. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> - ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -770,12 +770,14 @@ int domain_kill(struct domain *d) - return domain_kill(d); - d->is_dying = DOMDYING_dying; - argo_destroy(d); -- evtchn_destroy(d); - gnttab_release_mappings(d); - vnuma_destroy(d->vnuma); - domain_set_outstanding_pages(d, 0); - /* fallthrough */ - case DOMDYING_dying: -+ rc = evtchn_destroy(d); -+ if ( rc ) -+ break; - rc = domain_relinquish_resources(d); - if ( rc != 0 ) - break; ---- a/xen/common/event_channel.c -+++ b/xen/common/event_channel.c -@@ -1297,7 +1297,16 @@ int alloc_unbound_xen_event_channel( - - void free_xen_event_channel(struct domain *d, int port) - { -- BUG_ON(!port_is_valid(d, port)); -+ if ( !port_is_valid(d, port) ) -+ { -+ /* -+ * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing -+ * with the spin_barrier() and BUG_ON() in evtchn_destroy(). -+ */ -+ smp_rmb(); -+ BUG_ON(!d->is_dying); -+ return; -+ } - - evtchn_close(d, port, 0); - } -@@ -1309,7 +1318,17 @@ void notify_via_xen_event_channel(struct - struct domain *rd; - unsigned long flags; - -- ASSERT(port_is_valid(ld, lport)); -+ if ( !port_is_valid(ld, lport) ) -+ { -+ /* -+ * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing -+ * with the spin_barrier() and BUG_ON() in evtchn_destroy(). -+ */ -+ smp_rmb(); -+ ASSERT(ld->is_dying); -+ return; -+ } -+ - lchn = evtchn_from_port(ld, lport); - - spin_lock_irqsave(&lchn->lock, flags); -@@ -1380,8 +1399,7 @@ int evtchn_init(struct domain *d, unsign - return 0; - } - -- --void evtchn_destroy(struct domain *d) -+int evtchn_destroy(struct domain *d) - { - unsigned int i; - -@@ -1390,14 +1408,29 @@ void evtchn_destroy(struct domain *d) - spin_barrier(&d->event_lock); - - /* Close all existing event channels. 
*/ -- for ( i = 0; port_is_valid(d, i); i++ ) -+ for ( i = d->valid_evtchns; --i; ) -+ { - evtchn_close(d, i, 0); - -+ /* -+ * Avoid preempting when called from domain_create()'s error path, -+ * and don't check too often (choice of frequency is arbitrary). -+ */ -+ if ( i && !(i & 0x3f) && d->is_dying != DOMDYING_dead && -+ hypercall_preempt_check() ) -+ { -+ write_atomic(&d->valid_evtchns, i); -+ return -ERESTART; -+ } -+ } -+ - ASSERT(!d->active_evtchns); - - clear_global_virq_handlers(d); - - evtchn_fifo_destroy(d); -+ -+ return 0; - } - - ---- a/xen/include/xen/sched.h -+++ b/xen/include/xen/sched.h -@@ -136,7 +136,7 @@ struct evtchn - } __attribute__((aligned(64))); - - int evtchn_init(struct domain *d, unsigned int max_port); --void evtchn_destroy(struct domain *d); /* from domain_kill */ -+int evtchn_destroy(struct domain *d); /* from domain_kill */ - void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */ - - struct waitqueue_vcpu; diff --git a/system/xen/xsa/xsa344-4.13-2.patch b/system/xen/xsa/xsa344-4.13-2.patch deleted file mode 100644 index 3f0339498f..0000000000 --- a/system/xen/xsa/xsa344-4.13-2.patch +++ /dev/null @@ -1,203 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: evtchn: arrange for preemption in evtchn_reset() - -Like for evtchn_destroy() looping over all possible event channels to -close them can take a significant amount of time. Unlike done there, we -can't alter domain properties (i.e. d->valid_evtchns) here. Borrow, in a -lightweight form, the paging domctl continuation concept, redirecting -the continuations to different sub-ops. Just like there this is to be -able to allow for predictable overall results of the involved sub-ops: -Racing requests should either complete or be refused. - -Note that a domain can't interfere with an already started (by a remote -domain) reset, due to being paused. It can prevent a remote reset from -happening by leaving a reset unfinished, but that's only going to affect -itself. - -This is part of XSA-344. 
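(Aside: both XSA-344 patches apply the same continuation pattern — a long loop periodically checks for pending preemption and, if it must yield, records a resume cursor and returns -ERESTART so the operation is re-invoked later and picks up where it left off. A standalone sketch of the shape, illustrative only, with stubs standing in for hypercall_preempt_check() and evtchn_close():

#include <errno.h>
#include <stdbool.h>

#ifndef ERESTART
#define ERESTART 85                       /* Linux value; local fallback */
#endif

static bool need_preempt(void) { return false; }    /* stub */
static void close_port(unsigned int p) { (void)p; } /* stub */

/* *cursor == 0 means "fresh run"; ports start at 1 (port 0 reserved). */
static int teardown(unsigned int *cursor, unsigned int nr_ports)
{
    for (unsigned int i = *cursor ? *cursor : 1; i < nr_ports; i++) {
        close_port(i);

        if (!(i & 0x3f) && need_preempt()) { /* check every 64 ports */
            *cursor = i + 1;                 /* resume point */
            return -ERESTART;                /* caller re-invokes later */
        }
    }

    *cursor = 0;                             /* done; allow a fresh run */
    return 0;
}

End of aside.)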
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> - ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -1214,7 +1214,7 @@ void domain_unpause_except_self(struct d - domain_unpause(d); - } - --int domain_soft_reset(struct domain *d) -+int domain_soft_reset(struct domain *d, bool resuming) - { - struct vcpu *v; - int rc; -@@ -1228,7 +1228,7 @@ int domain_soft_reset(struct domain *d) - } - spin_unlock(&d->shutdown_lock); - -- rc = evtchn_reset(d); -+ rc = evtchn_reset(d, resuming); - if ( rc ) - return rc; - ---- a/xen/common/domctl.c -+++ b/xen/common/domctl.c -@@ -572,12 +572,22 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe - } - - case XEN_DOMCTL_soft_reset: -+ case XEN_DOMCTL_soft_reset_cont: - if ( d == current->domain ) /* no domain_pause() */ - { - ret = -EINVAL; - break; - } -- ret = domain_soft_reset(d); -+ ret = domain_soft_reset(d, op->cmd == XEN_DOMCTL_soft_reset_cont); -+ if ( ret == -ERESTART ) -+ { -+ op->cmd = XEN_DOMCTL_soft_reset_cont; -+ if ( !__copy_field_to_guest(u_domctl, op, cmd) ) -+ ret = hypercall_create_continuation(__HYPERVISOR_domctl, -+ "h", u_domctl); -+ else -+ ret = -EFAULT; -+ } - break; - - case XEN_DOMCTL_destroydomain: ---- a/xen/common/event_channel.c -+++ b/xen/common/event_channel.c -@@ -1057,7 +1057,7 @@ int evtchn_unmask(unsigned int port) - return 0; - } - --int evtchn_reset(struct domain *d) -+int evtchn_reset(struct domain *d, bool resuming) - { - unsigned int i; - int rc = 0; -@@ -1065,11 +1065,40 @@ int evtchn_reset(struct domain *d) - if ( d != current->domain && !d->controller_pause_count ) - return -EINVAL; - -- for ( i = 0; port_is_valid(d, i); i++ ) -+ spin_lock(&d->event_lock); -+ -+ /* -+ * If we are resuming, then start where we stopped. Otherwise, check -+ * that a reset operation is not already in progress, and if none is, -+ * record that this is now the case. -+ */ -+ i = resuming ? d->next_evtchn : !d->next_evtchn; -+ if ( i > d->next_evtchn ) -+ d->next_evtchn = i; -+ -+ spin_unlock(&d->event_lock); -+ -+ if ( !i ) -+ return -EBUSY; -+ -+ for ( ; port_is_valid(d, i); i++ ) -+ { - evtchn_close(d, i, 1); - -+ /* NB: Choice of frequency is arbitrary. 
*/ -+ if ( !(i & 0x3f) && hypercall_preempt_check() ) -+ { -+ spin_lock(&d->event_lock); -+ d->next_evtchn = i; -+ spin_unlock(&d->event_lock); -+ return -ERESTART; -+ } -+ } -+ - spin_lock(&d->event_lock); - -+ d->next_evtchn = 0; -+ - if ( d->active_evtchns > d->xen_evtchns ) - rc = -EAGAIN; - else if ( d->evtchn_fifo ) -@@ -1204,7 +1233,8 @@ long do_event_channel_op(int cmd, XEN_GU - break; - } - -- case EVTCHNOP_reset: { -+ case EVTCHNOP_reset: -+ case EVTCHNOP_reset_cont: { - struct evtchn_reset reset; - struct domain *d; - -@@ -1217,9 +1247,13 @@ long do_event_channel_op(int cmd, XEN_GU - - rc = xsm_evtchn_reset(XSM_TARGET, current->domain, d); - if ( !rc ) -- rc = evtchn_reset(d); -+ rc = evtchn_reset(d, cmd == EVTCHNOP_reset_cont); - - rcu_unlock_domain(d); -+ -+ if ( rc == -ERESTART ) -+ rc = hypercall_create_continuation(__HYPERVISOR_event_channel_op, -+ "ih", EVTCHNOP_reset_cont, arg); - break; - } - ---- a/xen/include/public/domctl.h -+++ b/xen/include/public/domctl.h -@@ -1152,7 +1152,10 @@ struct xen_domctl { - #define XEN_DOMCTL_iomem_permission 20 - #define XEN_DOMCTL_ioport_permission 21 - #define XEN_DOMCTL_hypercall_init 22 --#define XEN_DOMCTL_arch_setup 23 /* Obsolete IA64 only */ -+#ifdef __XEN__ -+/* #define XEN_DOMCTL_arch_setup 23 Obsolete IA64 only */ -+#define XEN_DOMCTL_soft_reset_cont 23 -+#endif - #define XEN_DOMCTL_settimeoffset 24 - #define XEN_DOMCTL_getvcpuaffinity 25 - #define XEN_DOMCTL_real_mode_area 26 /* Obsolete PPC only */ ---- a/xen/include/public/event_channel.h -+++ b/xen/include/public/event_channel.h -@@ -74,6 +74,9 @@ - #define EVTCHNOP_init_control 11 - #define EVTCHNOP_expand_array 12 - #define EVTCHNOP_set_priority 13 -+#ifdef __XEN__ -+#define EVTCHNOP_reset_cont 14 -+#endif - /* ` } */ - - typedef uint32_t evtchn_port_t; ---- a/xen/include/xen/event.h -+++ b/xen/include/xen/event.h -@@ -171,7 +171,7 @@ void evtchn_check_pollers(struct domain - void evtchn_2l_init(struct domain *d); - - /* Close all event channels and reset to 2-level ABI. */ --int evtchn_reset(struct domain *d); -+int evtchn_reset(struct domain *d, bool resuming); - - /* - * Low-level event channel port ops. ---- a/xen/include/xen/sched.h -+++ b/xen/include/xen/sched.h -@@ -394,6 +394,8 @@ struct domain - * EVTCHNOP_reset). Read/write access like for active_evtchns. - */ - unsigned int xen_evtchns; -+ /* Port to resume from in evtchn_reset(), when in a continuation. 
*/ -+ unsigned int next_evtchn; - spinlock_t event_lock; - const struct evtchn_port_ops *evtchn_port_ops; - struct evtchn_fifo_domain *evtchn_fifo; -@@ -663,7 +665,7 @@ int domain_shutdown(struct domain *d, u8 - void domain_resume(struct domain *d); - void domain_pause_for_debugger(void); - --int domain_soft_reset(struct domain *d); -+int domain_soft_reset(struct domain *d, bool resuming); - - int vcpu_start_shutdown_deferral(struct vcpu *v); - void vcpu_end_shutdown_deferral(struct vcpu *v); diff --git a/system/xen/xsa/xsa345-0001-x86-mm-Refactor-map_pages_to_xen-to-have-only-a-sing.patch b/system/xen/xsa/xsa345-0001-x86-mm-Refactor-map_pages_to_xen-to-have-only-a-sing.patch deleted file mode 100644 index d325385a56..0000000000 --- a/system/xen/xsa/xsa345-0001-x86-mm-Refactor-map_pages_to_xen-to-have-only-a-sing.patch +++ /dev/null @@ -1,94 +0,0 @@ -From b3e0d4e37b7902533a463812374947d4d6d2e463 Mon Sep 17 00:00:00 2001 -From: Wei Liu <wei.liu2@citrix.com> -Date: Sat, 11 Jan 2020 21:57:41 +0000 -Subject: [PATCH 1/3] x86/mm: Refactor map_pages_to_xen to have only a single - exit path - -We will soon need to perform clean-ups before returning. - -No functional change. - -This is part of XSA-345. - -Reported-by: Hongyan Xia <hongyxia@amazon.com> -Signed-off-by: Wei Liu <wei.liu2@citrix.com> -Signed-off-by: Hongyan Xia <hongyxia@amazon.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 17 +++++++++++------ - 1 file changed, 11 insertions(+), 6 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 30dffb68e8..133a393875 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -5187,6 +5187,7 @@ int map_pages_to_xen( - l2_pgentry_t *pl2e, ol2e; - l1_pgentry_t *pl1e, ol1e; - unsigned int i; -+ int rc = -ENOMEM; - - #define flush_flags(oldf) do { \ - unsigned int o_ = (oldf); \ -@@ -5207,7 +5208,8 @@ int map_pages_to_xen( - l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); - - if ( !pl3e ) -- return -ENOMEM; -+ goto out; -+ - ol3e = *pl3e; - - if ( cpu_has_page1gb && -@@ -5295,7 +5297,7 @@ int map_pages_to_xen( - - pl2e = alloc_xen_pagetable(); - if ( pl2e == NULL ) -- return -ENOMEM; -+ goto out; - - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - l2e_write(pl2e + i, -@@ -5324,7 +5326,7 @@ int map_pages_to_xen( - - pl2e = virt_to_xen_l2e(virt); - if ( !pl2e ) -- return -ENOMEM; -+ goto out; - - if ( ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) & - ((1u << PAGETABLE_ORDER) - 1)) == 0) && -@@ -5367,7 +5369,7 @@ int map_pages_to_xen( - { - pl1e = virt_to_xen_l1e(virt); - if ( pl1e == NULL ) -- return -ENOMEM; -+ goto out; - } - else if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) - { -@@ -5394,7 +5396,7 @@ int map_pages_to_xen( - - pl1e = alloc_xen_pagetable(); - if ( pl1e == NULL ) -- return -ENOMEM; -+ goto out; - - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - l1e_write(&pl1e[i], -@@ -5538,7 +5540,10 @@ int map_pages_to_xen( - - #undef flush_flags - -- return 0; -+ rc = 0; -+ -+ out: -+ return rc; - } - - int populate_pt_range(unsigned long virt, unsigned long nr_mfns) --- -2.25.1 - diff --git a/system/xen/xsa/xsa345-0002-x86-mm-Refactor-modify_xen_mappings-to-have-one-exit.patch b/system/xen/xsa/xsa345-0002-x86-mm-Refactor-modify_xen_mappings-to-have-one-exit.patch deleted file mode 100644 index 836bed681a..0000000000 --- a/system/xen/xsa/xsa345-0002-x86-mm-Refactor-modify_xen_mappings-to-have-one-exit.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 9f6f35b833d295acaaa2d8ff8cf309bf688cfd50 Mon Sep 17 00:00:00 
2001 -From: Wei Liu <wei.liu2@citrix.com> -Date: Sat, 11 Jan 2020 21:57:42 +0000 -Subject: [PATCH 2/3] x86/mm: Refactor modify_xen_mappings to have one exit - path - -We will soon need to perform clean-ups before returning. - -No functional change. - -This is part of XSA-345. - -Reported-by: Hongyan Xia <hongyxia@amazon.com> -Signed-off-by: Wei Liu <wei.liu2@citrix.com> -Signed-off-by: Hongyan Xia <hongyxia@amazon.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 12 +++++++++--- - 1 file changed, 9 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 133a393875..af726d3274 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -5570,6 +5570,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - l1_pgentry_t *pl1e; - unsigned int i; - unsigned long v = s; -+ int rc = -ENOMEM; - - /* Set of valid PTE bits which may be altered. */ - #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT) -@@ -5611,7 +5612,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - /* PAGE1GB: shatter the superpage and fall through. */ - pl2e = alloc_xen_pagetable(); - if ( !pl2e ) -- return -ENOMEM; -+ goto out; -+ - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - l2e_write(pl2e + i, - l2e_from_pfn(l3e_get_pfn(*pl3e) + -@@ -5666,7 +5668,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - /* PSE: shatter the superpage and try again. */ - pl1e = alloc_xen_pagetable(); - if ( !pl1e ) -- return -ENOMEM; -+ goto out; -+ - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - l1e_write(&pl1e[i], - l1e_from_pfn(l2e_get_pfn(*pl2e) + i, -@@ -5795,7 +5798,10 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - flush_area(NULL, FLUSH_TLB_GLOBAL); - - #undef FLAGS_MASK -- return 0; -+ rc = 0; -+ -+ out: -+ return rc; - } - - #undef flush_area --- -2.25.1 - diff --git a/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch b/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch deleted file mode 100644 index db407416b9..0000000000 --- a/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch +++ /dev/null @@ -1,249 +0,0 @@ -From 0ff9a8453dc47cd47eee9659d5916afb5094e871 Mon Sep 17 00:00:00 2001 -From: Hongyan Xia <hongyxia@amazon.com> -Date: Sat, 11 Jan 2020 21:57:43 +0000 -Subject: [PATCH 3/3] x86/mm: Prevent some races in hypervisor mapping updates - -map_pages_to_xen will attempt to coalesce mappings into 2MiB and 1GiB -superpages if possible, to maximize TLB efficiency. This means both -replacing superpage entries with smaller entries, and replacing -smaller entries with superpages. - -Unfortunately, while some potential races are handled correctly, -others are not. These include: - -1. When one processor modifies a sub-superpage mapping while another -processor replaces the entire range with a superpage. - -Take the following example: - -Suppose L3[N] points to L2. And suppose we have two processors, A and -B. - -* A walks the pagetables, gets a pointer to L2. -* B replaces L3[N] with a 1GiB mapping. -* B frees L2 -* A writes L2[M] # - -This race is exacerbated by the fact that virt_to_xen_l[21]e doesn't -handle higher-level superpages properly: If you call virt_to_xen_l2e -on a virtual address within an L3 superpage, you'll either hit a BUG() -(most likely), or get a pointer into the middle of a data page; same -with virt_to_xen_l1e on a virtual address within either an L3 or L2 -superpage. - -So take the following example: - -* A reads pl3e and discovers it to point to an L2. -* B replaces L3[N] with a 1GiB mapping -* A calls virt_to_xen_l2e() and hits the BUG_ON() # - -2. When two processors simultaneously try to replace a sub-superpage -mapping with a superpage mapping. - -Take the following example: - -Suppose L3[N] points to L2. And suppose we have two processors, A and B, -both trying to replace L3[N] with a superpage. - -* A walks the pagetables, gets a pointer to pl3e, and takes a copy ol3e pointing to L2. -* B walks the pagetables, gets a pointer to pl3e, and takes a copy ol3e pointing to L2. -* A writes the new value into L3[N] -* B writes the new value into L3[N] -* A recursively frees all the L1's under L2, then frees L2 -* B recursively double-frees all the L1's under L2, then double-frees L2 # - -Fix this by grabbing a lock for the entirety of the mapping update -operation. - -Rather than grabbing map_pgdir_lock for the entire operation, however, -repurpose the PGT_locked bit from L3's page->type_info as a lock. -This means that rather than locking the entire address space, we -"only" lock a single 512GiB chunk of hypervisor address space at a -time. - -There was a proposal for a lock-and-reverify approach, where we walk -the pagetables to the point where we decide what to do; then grab the -map_pgdir_lock, re-verify the information we collected without the -lock, and finally make the change (starting over again if anything had -changed). Without being able to guarantee that the L2 table wasn't -freed, however, that means every read would need to be considered -potentially unsafe. Thinking carefully about that is probably -something that wants to be done in public, not under time pressure. - -This is part of XSA-345. - -Reported-by: Hongyan Xia <hongyxia@amazon.com> -Signed-off-by: Hongyan Xia <hongyxia@amazon.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 92 +++++++++++++++++++++++++++++++++++++++++++++-- - 1 file changed, 89 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index af726d3274..d6a0761f43 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -2167,6 +2167,50 @@ void page_unlock(struct page_info *page) - current_locked_page_set(NULL); - } - -+/* -+ * L3 table locks: -+ * -+ * Used for serialization in map_pages_to_xen() and modify_xen_mappings(). -+ * -+ * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to -+ * reuse the PGT_locked flag. This lock is taken only when we move down to L3 -+ * tables and below, since L4 (and above, for 5-level paging) is still globally -+ * protected by map_pgdir_lock. -+ * -+ * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock(). -+ * This has two implications: -+ * - We cannot reuse current_locked_page_* for debugging -+ * - To avoid the chance of deadlock, even for different pages, we -+ * must never grab page_lock() after grabbing l3t_lock(). This -+ * includes any page_lock()-based locks, such as -+ * mem_sharing_page_lock().
-+ * -+ * Also note that we grab the map_pgdir_lock while holding the -+ * l3t_lock(), so to avoid deadlock we must avoid grabbing them in -+ * reverse order. -+ */ -+static void l3t_lock(struct page_info *page) -+{ -+ unsigned long x, nx; -+ -+ do { -+ while ( (x = page->u.inuse.type_info) & PGT_locked ) -+ cpu_relax(); -+ nx = x | PGT_locked; -+ } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); -+} -+ -+static void l3t_unlock(struct page_info *page) -+{ -+ unsigned long x, nx, y = page->u.inuse.type_info; -+ -+ do { -+ x = y; -+ BUG_ON(!(x & PGT_locked)); -+ nx = x & ~PGT_locked; -+ } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x ); -+} -+ - #ifdef CONFIG_PV - /* - * PTE flags that a guest may change without re-validating the PTE. -@@ -5177,6 +5221,23 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) - flush_area_local((const void *)v, f) : \ - flush_area_all((const void *)v, f)) - -+#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR -+ -+#define L3T_LOCK(page) \ -+ do { \ -+ if ( locking ) \ -+ l3t_lock(page); \ -+ } while ( false ) -+ -+#define L3T_UNLOCK(page) \ -+ do { \ -+ if ( locking && (page) != ZERO_BLOCK_PTR ) \ -+ { \ -+ l3t_unlock(page); \ -+ (page) = ZERO_BLOCK_PTR; \ -+ } \ -+ } while ( false ) -+ - int map_pages_to_xen( - unsigned long virt, - mfn_t mfn, -@@ -5188,6 +5249,7 @@ int map_pages_to_xen( - l1_pgentry_t *pl1e, ol1e; - unsigned int i; - int rc = -ENOMEM; -+ struct page_info *current_l3page; - - #define flush_flags(oldf) do { \ - unsigned int o_ = (oldf); \ -@@ -5203,13 +5265,20 @@ int map_pages_to_xen( - } \ - } while (0) - -+ L3T_INIT(current_l3page); -+ - while ( nr_mfns != 0 ) - { -- l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); -+ l3_pgentry_t *pl3e, ol3e; - -+ L3T_UNLOCK(current_l3page); -+ -+ pl3e = virt_to_xen_l3e(virt); - if ( !pl3e ) - goto out; - -+ current_l3page = virt_to_page(pl3e); -+ L3T_LOCK(current_l3page); - ol3e = *pl3e; - - if ( cpu_has_page1gb && -@@ -5543,6 +5612,7 @@ int map_pages_to_xen( - rc = 0; - - out: -+ L3T_UNLOCK(current_l3page); - return rc; - } - -@@ -5571,6 +5641,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - unsigned int i; - unsigned long v = s; - int rc = -ENOMEM; -+ struct page_info *current_l3page; - - /* Set of valid PTE bits which may be altered. */ - #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT) -@@ -5579,11 +5650,22 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - ASSERT(IS_ALIGNED(s, PAGE_SIZE)); - ASSERT(IS_ALIGNED(e, PAGE_SIZE)); - -+ L3T_INIT(current_l3page); -+ - while ( v < e ) - { -- l3_pgentry_t *pl3e = virt_to_xen_l3e(v); -+ l3_pgentry_t *pl3e; -+ -+ L3T_UNLOCK(current_l3page); - -- if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) -+ pl3e = virt_to_xen_l3e(v); -+ if ( !pl3e ) -+ goto out; -+ -+ current_l3page = virt_to_page(pl3e); -+ L3T_LOCK(current_l3page); -+ -+ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) - { - /* Confirm the caller isn't trying to create new mappings. 
*/ - ASSERT(!(nf & _PAGE_PRESENT)); -@@ -5801,9 +5883,13 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) - rc = 0; - - out: -+ L3T_UNLOCK(current_l3page); - return rc; - } - -+#undef L3T_LOCK -+#undef L3T_UNLOCK -+ - #undef flush_area - - int destroy_xen_mappings(unsigned long s, unsigned long e) --- -2.25.1 - diff --git a/system/xen/xsa/xsa346-4.13-1.patch b/system/xen/xsa/xsa346-4.13-1.patch deleted file mode 100644 index a32e658e80..0000000000 --- a/system/xen/xsa/xsa346-4.13-1.patch +++ /dev/null @@ -1,50 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: IOMMU: suppress "iommu_dont_flush_iotlb" when about to free a page - -Deferring flushes to a single, wide-range one - as is done when -handling XENMAPSPACE_gmfn_range - is okay only as long as -pages don't get freed ahead of the eventual flush. While the only -function setting the flag (xenmem_add_to_physmap()) suggests by its name -that it's only mapping new entries, in reality the way -xenmem_add_to_physmap_one() works means an unmap would happen not only -for the page being moved (but not freed) but, if the destination GFN is -populated, also for the page being displaced from that GFN. Collapsing -the two flushes for this GFN into just one (and even more so deferring -it to a batched invocation) is not correct. - -This is part of XSA-346. - -Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Paul Durrant <paul@xen.org> -Acked-by: Julien Grall <jgrall@amazon.com> - ---- a/xen/common/memory.c -+++ b/xen/common/memory.c -@@ -292,6 +292,7 @@ int guest_remove_page(struct domain *d, - p2m_type_t p2mt; - #endif - mfn_t mfn; -+ bool *dont_flush_p, dont_flush; - int rc; - - #ifdef CONFIG_X86 -@@ -378,8 +379,18 @@ int guest_remove_page(struct domain *d, - return -ENXIO; - } - -+ /* -+ * Since we're likely to free the page below, we need to suspend -+ * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes. -+ */ -+ dont_flush_p = &this_cpu(iommu_dont_flush_iotlb); -+ dont_flush = *dont_flush_p; -+ *dont_flush_p = false; -+ - rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0); - -+ *dont_flush_p = dont_flush; -+ - /* - * With the lack of an IOMMU on some platforms, domains with DMA-capable - * device must retrieve the same pfn when the hypercall populate_physmap diff --git a/system/xen/xsa/xsa346-4.13-2.patch b/system/xen/xsa/xsa346-4.13-2.patch deleted file mode 100644 index 6371b5c3db..0000000000 --- a/system/xen/xsa/xsa346-4.13-2.patch +++ /dev/null @@ -1,204 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: IOMMU: hold page ref until after deferred TLB flush - -When moving around a page via XENMAPSPACE_gmfn_range, deferring the TLB -flush for the "from" GFN range requires that the page remains allocated -to the guest until the TLB flush has actually occurred. Otherwise a -parallel hypercall to remove the page would only flush the TLB for the -GFN it has been moved to, but not the one it was mapped at originally. - -This is part of XSA-346. - -Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... 
") -Reported-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> - ---- a/xen/arch/arm/mm.c -+++ b/xen/arch/arm/mm.c -@@ -1407,7 +1407,7 @@ void share_xen_page_with_guest(struct pa - int xenmem_add_to_physmap_one( - struct domain *d, - unsigned int space, -- union xen_add_to_physmap_batch_extra extra, -+ union add_to_physmap_extra extra, - unsigned long idx, - gfn_t gfn) - { -@@ -1480,10 +1480,6 @@ int xenmem_add_to_physmap_one( - break; - } - case XENMAPSPACE_dev_mmio: -- /* extra should be 0. Reserved for future use. */ -- if ( extra.res0 ) -- return -EOPNOTSUPP; -- - rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx)); - return rc; - ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -4617,7 +4617,7 @@ static int handle_iomem_range(unsigned l - int xenmem_add_to_physmap_one( - struct domain *d, - unsigned int space, -- union xen_add_to_physmap_batch_extra extra, -+ union add_to_physmap_extra extra, - unsigned long idx, - gfn_t gpfn) - { -@@ -4701,9 +4701,20 @@ int xenmem_add_to_physmap_one( - rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K); - - put_both: -- /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */ -+ /* -+ * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. -+ * We also may need to transfer ownership of the page reference to our -+ * caller. -+ */ - if ( space == XENMAPSPACE_gmfn ) -+ { - put_gfn(d, gfn); -+ if ( !rc && extra.ppage ) -+ { -+ *extra.ppage = page; -+ page = NULL; -+ } -+ } - - if ( page ) - put_page(page); ---- a/xen/common/memory.c -+++ b/xen/common/memory.c -@@ -814,13 +814,12 @@ int xenmem_add_to_physmap(struct domain - { - unsigned int done = 0; - long rc = 0; -- union xen_add_to_physmap_batch_extra extra; -+ union add_to_physmap_extra extra = {}; -+ struct page_info *pages[16]; - - ASSERT(paging_mode_translate(d)); - -- if ( xatp->space != XENMAPSPACE_gmfn_foreign ) -- extra.res0 = 0; -- else -+ if ( xatp->space == XENMAPSPACE_gmfn_foreign ) - extra.foreign_domid = DOMID_INVALID; - - if ( xatp->space != XENMAPSPACE_gmfn_range ) -@@ -835,7 +834,10 @@ int xenmem_add_to_physmap(struct domain - xatp->size -= start; - - if ( is_iommu_enabled(d) ) -+ { - this_cpu(iommu_dont_flush_iotlb) = 1; -+ extra.ppage = &pages[0]; -+ } - - while ( xatp->size > done ) - { -@@ -847,8 +849,12 @@ int xenmem_add_to_physmap(struct domain - xatp->idx++; - xatp->gpfn++; - -+ if ( extra.ppage ) -+ ++extra.ppage; -+ - /* Check for continuation if it's not the last iteration. */ -- if ( xatp->size > ++done && hypercall_preempt_check() ) -+ if ( (++done > ARRAY_SIZE(pages) && extra.ppage) || -+ (xatp->size > done && hypercall_preempt_check()) ) - { - rc = start + done; - break; -@@ -858,6 +864,7 @@ int xenmem_add_to_physmap(struct domain - if ( is_iommu_enabled(d) ) - { - int ret; -+ unsigned int i; - - this_cpu(iommu_dont_flush_iotlb) = 0; - -@@ -866,6 +873,15 @@ int xenmem_add_to_physmap(struct domain - if ( unlikely(ret) && rc >= 0 ) - rc = ret; - -+ /* -+ * Now that the IOMMU TLB flush was done for the original GFN, drop -+ * the page references. The 2nd flush below is fine to make later, as -+ * whoever removes the page again from its new GFN will have to do -+ * another flush anyway. 
-+ */ -+ for ( i = 0; i < done; ++i ) -+ put_page(pages[i]); -+ - ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done, - IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified); - if ( unlikely(ret) && rc >= 0 ) -@@ -879,6 +895,8 @@ static int xenmem_add_to_physmap_batch(s - struct xen_add_to_physmap_batch *xatpb, - unsigned int extent) - { -+ union add_to_physmap_extra extra = {}; -+ - if ( unlikely(xatpb->size < extent) ) - return -EILSEQ; - -@@ -890,6 +908,19 @@ static int xenmem_add_to_physmap_batch(s - !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) ) - return -EFAULT; - -+ switch ( xatpb->space ) -+ { -+ case XENMAPSPACE_dev_mmio: -+ /* res0 is reserved for future use. */ -+ if ( xatpb->u.res0 ) -+ return -EOPNOTSUPP; -+ break; -+ -+ case XENMAPSPACE_gmfn_foreign: -+ extra.foreign_domid = xatpb->u.foreign_domid; -+ break; -+ } -+ - while ( xatpb->size > extent ) - { - xen_ulong_t idx; -@@ -902,8 +933,7 @@ static int xenmem_add_to_physmap_batch(s - extent, 1)) ) - return -EFAULT; - -- rc = xenmem_add_to_physmap_one(d, xatpb->space, -- xatpb->u, -+ rc = xenmem_add_to_physmap_one(d, xatpb->space, extra, - idx, _gfn(gpfn)); - - if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) ) ---- a/xen/include/xen/mm.h -+++ b/xen/include/xen/mm.h -@@ -588,8 +588,22 @@ void scrub_one_page(struct page_info *); - &(d)->xenpage_list : &(d)->page_list) - #endif - -+union add_to_physmap_extra { -+ /* -+ * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs -+ * to be kept until after the flush, so the page can't get removed from -+ * the domain (and re-used for another purpose) beforehand. By passing -+ * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants -+ * to have ownership of such a reference transferred in the success case. -+ */ -+ struct page_info **ppage; -+ -+ /* XENMAPSPACE_gmfn_foreign */ -+ domid_t foreign_domid; -+}; -+ - int xenmem_add_to_physmap_one(struct domain *d, unsigned int space, -- union xen_add_to_physmap_batch_extra extra, -+ union add_to_physmap_extra extra, - unsigned long idx, gfn_t gfn); - - int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, diff --git a/system/xen/xsa/xsa347-4.13-1.patch b/system/xen/xsa/xsa347-4.13-1.patch deleted file mode 100644 index e9f31a151f..0000000000 --- a/system/xen/xsa/xsa347-4.13-1.patch +++ /dev/null @@ -1,149 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: AMD/IOMMU: convert amd_iommu_pte from struct to union - -This is to add a "raw" counterpart to the bitfield equivalent. Take the -opportunity and - - convert fields to bool / unsigned int, - - drop the naming of the reserved field, - - shorten the names of the ignored ones. - -This is part of XSA-347. 
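
The value of the struct-to-union conversion is that it pairs the bitfield view of a page-table entry with a same-sized raw integer, so an entry can be read, compared, and written as a single 64-bit quantity; the follow-up patch relies on exactly that. A generic sketch of the idiom follows, with an illustrative field layout rather than the real AMD one (union pte, pte_needs_flush and all field names are made up for the example):

    #include <stdbool.h>
    #include <stdint.h>

    union pte {
        uint64_t raw;              /* whole-entry view: one load or store */
        struct {
            bool present:1;        /* illustrative fields only */
            unsigned int avail:3;
            uint64_t addr:40;
            unsigned int upper:20;
        };
    };

    /* Comparing the raw views catches a change to any field at once. */
    static inline bool pte_needs_flush(union pte old, union pte new)
    {
        return old.present && old.raw != new.raw;
    }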
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Paul Durrant <paul@xen.org> - ---- a/xen/drivers/passthrough/amd/iommu_map.c -+++ b/xen/drivers/passthrough/amd/iommu_map.c -@@ -38,7 +38,7 @@ static unsigned int pfn_to_pde_idx(unsig - static unsigned int clear_iommu_pte_present(unsigned long l1_mfn, - unsigned long dfn) - { -- struct amd_iommu_pte *table, *pte; -+ union amd_iommu_pte *table, *pte; - unsigned int flush_flags; - - table = map_domain_page(_mfn(l1_mfn)); -@@ -52,7 +52,7 @@ static unsigned int clear_iommu_pte_pres - return flush_flags; - } - --static unsigned int set_iommu_pde_present(struct amd_iommu_pte *pte, -+static unsigned int set_iommu_pde_present(union amd_iommu_pte *pte, - unsigned long next_mfn, - unsigned int next_level, bool iw, - bool ir) -@@ -87,7 +87,7 @@ static unsigned int set_iommu_pte_presen - int pde_level, - bool iw, bool ir) - { -- struct amd_iommu_pte *table, *pde; -+ union amd_iommu_pte *table, *pde; - unsigned int flush_flags; - - table = map_domain_page(_mfn(pt_mfn)); -@@ -178,7 +178,7 @@ void iommu_dte_set_guest_cr3(struct amd_ - static int iommu_pde_from_dfn(struct domain *d, unsigned long dfn, - unsigned long pt_mfn[], bool map) - { -- struct amd_iommu_pte *pde, *next_table_vaddr; -+ union amd_iommu_pte *pde, *next_table_vaddr; - unsigned long next_table_mfn; - unsigned int level; - struct page_info *table; -@@ -458,7 +458,7 @@ int __init amd_iommu_quarantine_init(str - unsigned long end_gfn = - 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT); - unsigned int level = amd_iommu_get_paging_mode(end_gfn); -- struct amd_iommu_pte *table; -+ union amd_iommu_pte *table; - - if ( hd->arch.root_table ) - { -@@ -489,7 +489,7 @@ int __init amd_iommu_quarantine_init(str - - for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ ) - { -- struct amd_iommu_pte *pde = &table[i]; -+ union amd_iommu_pte *pde = &table[i]; - - /* - * PDEs are essentially a subset of PTEs, so this function ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c -@@ -390,7 +390,7 @@ static void deallocate_next_page_table(s - - static void deallocate_page_table(struct page_info *pg) - { -- struct amd_iommu_pte *table_vaddr; -+ union amd_iommu_pte *table_vaddr; - unsigned int index, level = PFN_ORDER(pg); - - PFN_ORDER(pg) = 0; -@@ -405,7 +405,7 @@ static void deallocate_page_table(struct - - for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) - { -- struct amd_iommu_pte *pde = &table_vaddr[index]; -+ union amd_iommu_pte *pde = &table_vaddr[index]; - - if ( pde->mfn && pde->next_level && pde->pr ) - { -@@ -557,7 +557,7 @@ static void amd_dump_p2m_table_level(str - paddr_t gpa, int indent) - { - paddr_t address; -- struct amd_iommu_pte *table_vaddr; -+ const union amd_iommu_pte *table_vaddr; - int index; - - if ( level < 1 ) -@@ -573,7 +573,7 @@ static void amd_dump_p2m_table_level(str - - for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) - { -- struct amd_iommu_pte *pde = &table_vaddr[index]; -+ const union amd_iommu_pte *pde = &table_vaddr[index]; - - if ( !(index % 2) ) - process_pending_softirqs(); ---- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h -+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h -@@ -465,20 +465,23 @@ union amd_iommu_x2apic_control { - #define IOMMU_PAGE_TABLE_U32_PER_ENTRY (IOMMU_PAGE_TABLE_ENTRY_SIZE / 4) - #define IOMMU_PAGE_TABLE_ALIGNMENT 4096 - --struct amd_iommu_pte { -- uint64_t pr:1; -- uint64_t ignored0:4; -- uint64_t a:1; -- uint64_t d:1; -- 
uint64_t ignored1:2; -- uint64_t next_level:3; -- uint64_t mfn:40; -- uint64_t reserved:7; -- uint64_t u:1; -- uint64_t fc:1; -- uint64_t ir:1; -- uint64_t iw:1; -- uint64_t ignored2:1; -+union amd_iommu_pte { -+ uint64_t raw; -+ struct { -+ bool pr:1; -+ unsigned int ign0:4; -+ bool a:1; -+ bool d:1; -+ unsigned int ign1:2; -+ unsigned int next_level:3; -+ uint64_t mfn:40; -+ unsigned int :7; -+ bool u:1; -+ bool fc:1; -+ bool ir:1; -+ bool iw:1; -+ unsigned int ign2:1; -+ }; - }; - - /* Paging modes */ diff --git a/system/xen/xsa/xsa347-4.13-2.patch b/system/xen/xsa/xsa347-4.13-2.patch deleted file mode 100644 index fbe7461636..0000000000 --- a/system/xen/xsa/xsa347-4.13-2.patch +++ /dev/null @@ -1,72 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: AMD/IOMMU: update live PTEs atomically - -Updating a live PTE bitfield by bitfield risks the compiler re-ordering -the individual updates as well as splitting individual updates into -multiple memory writes. Construct the new entry fully in a local -variable, do the check to determine the flushing needs on the thus -established new entry, and then write the new entry by a single insn. - -Similarly using memset() to clear a PTE is unsafe, as the order of -writes the function does is, at least in principle, undefined. - -This is part of XSA-347. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Paul Durrant <paul@xen.org> - ---- a/xen/drivers/passthrough/amd/iommu_map.c -+++ b/xen/drivers/passthrough/amd/iommu_map.c -@@ -45,7 +45,7 @@ static unsigned int clear_iommu_pte_pres - pte = &table[pfn_to_pde_idx(dfn, 1)]; - - flush_flags = pte->pr ? IOMMU_FLUSHF_modified : 0; -- memset(pte, 0, sizeof(*pte)); -+ write_atomic(&pte->raw, 0); - - unmap_domain_page(table); - -@@ -57,26 +57,30 @@ static unsigned int set_iommu_pde_presen - unsigned int next_level, bool iw, - bool ir) - { -+ union amd_iommu_pte new = {}, old; - unsigned int flush_flags = IOMMU_FLUSHF_added; - -- if ( pte->pr && -- (pte->mfn != next_mfn || -- pte->iw != iw || -- pte->ir != ir || -- pte->next_level != next_level) ) -- flush_flags |= IOMMU_FLUSHF_modified; -- - /* - * FC bit should be enabled in PTE, this helps to solve potential - * issues with ATS devices - */ -- pte->fc = !next_level; -+ new.fc = !next_level; -+ -+ new.mfn = next_mfn; -+ new.iw = iw; -+ new.ir = ir; -+ new.next_level = next_level; -+ new.pr = true; -+ -+ old.raw = read_atomic(&pte->raw); -+ old.ign0 = 0; -+ old.ign1 = 0; -+ old.ign2 = 0; -+ -+ if ( old.pr && old.raw != new.raw ) -+ flush_flags |= IOMMU_FLUSHF_modified; - -- pte->mfn = next_mfn; -- pte->iw = iw; -- pte->ir = ir; -- pte->next_level = next_level; -- pte->pr = 1; -+ write_atomic(&pte->raw, new.raw); - - return flush_flags; - } diff --git a/system/xen/xsa/xsa347-4.13-3.patch b/system/xen/xsa/xsa347-4.13-3.patch deleted file mode 100644 index 90c8e66020..0000000000 --- a/system/xen/xsa/xsa347-4.13-3.patch +++ /dev/null @@ -1,59 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: AMD/IOMMU: ensure suitable ordering of DTE modifications - -DMA and interrupt translation should be enabled only after other -applicable DTE fields have been written. Similarly when disabling -translation or when moving a device between domains, translation should -first be disabled, before other entry fields get modified. Note however -that the "moving" aspect doesn't apply to the interrupt remapping side, -as domain specifics are maintained in the IRTEs here, not the DTE. 
We -also never disable interrupt remapping once it got enabled for a device -(the respective argument passed is always the immutable iommu_intremap). - -This is part of XSA-347. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Paul Durrant <paul@xen.org> - ---- a/xen/drivers/passthrough/amd/iommu_map.c -+++ b/xen/drivers/passthrough/amd/iommu_map.c -@@ -107,11 +107,18 @@ void amd_iommu_set_root_page_table(struc - uint64_t root_ptr, uint16_t domain_id, - uint8_t paging_mode, bool valid) - { -+ if ( valid || dte->v ) -+ { -+ dte->tv = false; -+ dte->v = true; -+ smp_wmb(); -+ } - dte->domain_id = domain_id; - dte->pt_root = paddr_to_pfn(root_ptr); - dte->iw = true; - dte->ir = true; - dte->paging_mode = paging_mode; -+ smp_wmb(); - dte->tv = true; - dte->v = valid; - } -@@ -134,6 +141,7 @@ void amd_iommu_set_intremap_table( - } - - dte->ig = false; /* unmapped interrupts result in i/o page faults */ -+ smp_wmb(); - dte->iv = valid; - } - ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c -@@ -120,7 +120,10 @@ static void amd_iommu_setup_domain_devic - /* Undo what amd_iommu_disable_domain_device() may have done. */ - ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; - if ( dte->it_root ) -+ { - dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED; -+ smp_wmb(); -+ } - dte->iv = iommu_intremap; - dte->ex = ivrs_dev->dte_allow_exclusion; - dte->sys_mgt = MASK_EXTR(ivrs_dev->device_flags, ACPI_IVHD_SYSTEM_MGMT); |
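
The rule this last patch applies generalizes to any descriptor that hardware may consume concurrently: fill in all other fields, issue a write barrier, and only then set the bit that makes the entry live; tear down in the reverse order. A condensed Xen-style sketch, under the assumption of a made-up descriptor layout (struct dev_entry, publish_entry and retract_entry are hypothetical; smp_wmb() is the real Xen write barrier):

    struct dev_entry {
        uint64_t table_root;       /* illustrative fields only */
        uint16_t domain_id;
        bool valid;                /* hardware honours the entry iff set */
    };

    static void publish_entry(struct dev_entry *e, uint64_t root, uint16_t domid)
    {
        e->table_root = root;      /* fill in everything else first ... */
        e->domain_id = domid;
        smp_wmb();                 /* ... make those writes visible ... */
        e->valid = true;           /* ... then flip the enable bit */
    }

    static void retract_entry(struct dev_entry *e)
    {
        e->valid = false;          /* disable first */
        smp_wmb();
        e->table_root = 0;         /* now safe to rewrite the rest */
        e->domain_id = 0;
    }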