Commit 9941e245 authored by Lorenzo "Palinuro" Faletra

Import Upstream version 5.10.28

parent 4e026225
Pipeline #2889 failed
......@@ -37,4 +37,11 @@
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
#define ARCH_ADD_TRAMP_KEY(name) \
asm(".pushsection .static_call_tramp_key, \"a\" \n" \
".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \
".long " STATIC_CALL_KEY_STR(name) " - . \n" \
".popsection \n")
#endif /* _ASM_STATIC_CALL_H */
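For illustration only: with a hypothetical static call named my_func, and assuming the __SCT__/__SCK__ trampoline and key symbol prefixes used by static_call_types.h in this kernel, ARCH_ADD_TRAMP_KEY(my_func) expands to roughly the following, recording PC-relative offsets of the trampoline and its key in the .static_call_tramp_key section:

asm(".pushsection .static_call_tramp_key, \"a\" \n"
    ".long __SCT__my_func - . \n"
    ".long __SCK__my_func - . \n"
    ".popsection \n");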
......@@ -216,10 +216,31 @@ static inline int arch_within_stack_frames(const void * const stack,
#endif
/*
* Thread-synchronous status.
*
* This is different from the flags in that nobody else
* ever touches our thread-synchronous status, so we don't
* have to worry about atomic accesses.
*/
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
#ifndef __ASSEMBLY__
#ifdef CONFIG_COMPAT
#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */
#define TS_COMPAT_RESTART 0x0008
#define arch_set_restart_data arch_set_restart_data
static inline void arch_set_restart_data(struct restart_block *restart)
{
struct thread_info *ti = current_thread_info();
if (ti->status & TS_COMPAT)
ti->status |= TS_COMPAT_RESTART;
else
ti->status &= ~TS_COMPAT_RESTART;
}
#endif
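A minimal sketch of the consumer side, mirroring the get_nr_restart_syscall() hunk further down in this commit: once arch_set_restart_data() has latched the state, the restart path only needs to test the flag instead of guessing from TS_COMPAT or TS_I386_REGS_POKED.

#ifdef CONFIG_IA32_EMULATION
	/* flag was recorded at signal-setup time by arch_set_restart_data() */
	if (current_thread_info()->status & TS_COMPAT_RESTART)
		return __NR_ia32_restart_syscall;
#endif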
#ifndef __ASSEMBLY__
#ifdef CONFIG_X86_32
#define in_ia32_syscall() true
......
......@@ -86,18 +86,6 @@ clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
}
#endif
/*
* The maximum amount of extra memory compared to the base size. The
* main scaling factor is the size of struct page. At extreme ratios
* of base:extra, all the base memory can be filled with page
* structures for the extra memory, leaving no space for anything
* else.
*
* 10x seems like a reasonable balance between scaling flexibility and
* leaving a practically usable system.
*/
#define XEN_EXTRA_MEM_RATIO (10)
/*
* Helper functions to write or read unsigned long values to/from
* memory, when the access may fault.
......
......@@ -1554,10 +1554,18 @@ void __init acpi_boot_table_init(void)
/*
* Initialize the ACPI boot-time table parser.
*/
if (acpi_table_init()) {
if (acpi_locate_initial_tables())
disable_acpi();
return;
}
else
acpi_reserve_initial_tables();
}
int __init early_acpi_boot_init(void)
{
if (acpi_disabled)
return 1;
acpi_table_init_complete();
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
......@@ -1570,18 +1578,9 @@ void __init acpi_boot_table_init(void)
} else {
printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
disable_acpi();
return;
return 1;
}
}
}
int __init early_acpi_boot_init(void)
{
/*
* If acpi_disabled, bail out
*/
if (acpi_disabled)
return 1;
/*
* Process the Multiple APIC Description Table (MADT), if present
......
......@@ -2317,6 +2317,11 @@ static int cpuid_to_apicid[] = {
[0 ... NR_CPUS - 1] = -1,
};
bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
return phys_id == cpuid_to_apicid[cpu];
}
#ifdef CONFIG_SMP
/**
* apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
......
......@@ -1033,6 +1033,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
legacy = mp_is_legacy_irq(irq);
/*
* IRQ2 is unusable for historical reasons on systems which
* have a legacy PIC. See the comment vs. IRQ2 further down.
*
* If this gets removed at some point then the related code
* in lapic_assign_system_vectors() needs to be adjusted as
* well.
*/
if (legacy && irq == PIC_CASCADE_IR)
return -EINVAL;
}
mutex_lock(&ioapic_mutex);
......
......@@ -1051,6 +1051,9 @@ void __init setup_arch(char **cmdline_p)
cleanup_highmap();
/* Look for ACPI tables and reserve memory occupied by them. */
acpi_boot_table_init();
memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
......@@ -1136,11 +1139,6 @@ void __init setup_arch(char **cmdline_p)
early_platform_quirks();
/*
* Parse the ACPI tables for possible boot-time SMP configuration.
*/
acpi_boot_table_init();
early_acpi_boot_init();
initmem_init();
......
......@@ -766,30 +766,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
{
/*
* This function is fundamentally broken as currently
* implemented.
*
* The idea is that we want to trigger a call to the
* restart_block() syscall and that we want in_ia32_syscall(),
* in_x32_syscall(), etc. to match whatever they were in the
* syscall being restarted. We assume that the syscall
* instruction at (regs->ip - 2) matches whatever syscall
* instruction we used to enter in the first place.
*
* The problem is that we can get here when ptrace pokes
* syscall-like values into regs even if we're not in a syscall
* at all.
*
* For now, we maintain historical behavior and guess based on
* stored state. We could do better by saving the actual
* syscall arch in restart_block or (with caveats on x32) by
* checking if regs->ip points to 'int $0x80'. The current
* behavior is incorrect if a tracer has a different bitness
* than the tracee.
*/
#ifdef CONFIG_IA32_EMULATION
if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
if (current_thread_info()->status & TS_COMPAT_RESTART)
return __NR_ia32_restart_syscall;
#endif
#ifdef CONFIG_X86_X32_ABI
......
......@@ -1655,7 +1655,7 @@ void play_dead_common(void)
local_irq_disable();
}
static bool wakeup_cpu0(void)
bool wakeup_cpu0(void)
{
if (smp_processor_id() == 0 && enable_start_cpu0)
return true;
......
......@@ -246,11 +246,18 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
return true;
}
static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
bool vmcb12_lma;
/*
* FIXME: these should be done after copying the fields,
* to avoid TOC/TOU races. For these save area checks
* the possible damage is limited since kvm_set_cr0 and
* kvm_set_cr4 handle failure; EFER_SVME is an exception
* so it is force-set later in nested_prepare_vmcb_save.
*/
if ((vmcb12->save.efer & EFER_SVME) == 0)
return false;
......@@ -271,7 +278,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
return false;
return nested_vmcb_check_controls(&vmcb12->control);
return true;
}
static void load_nested_vmcb_control(struct vcpu_svm *svm,
......@@ -396,7 +403,14 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.gdtr = vmcb12->save.gdtr;
svm->vmcb->save.idtr = vmcb12->save.idtr;
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags);
svm_set_efer(&svm->vcpu, vmcb12->save.efer);
/*
* Force-set EFER_SVME even though it is checked earlier on the
* VMCB12, because the guest can flip the bit between the check
* and now. Clearing EFER_SVME would call svm_free_nested.
*/
svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME);
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
......@@ -454,7 +468,6 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
int ret;
svm->nested.vmcb12_gpa = vmcb12_gpa;
load_nested_vmcb_control(svm, &vmcb12->control);
nested_prepare_vmcb_save(svm, vmcb12);
nested_prepare_vmcb_control(svm);
......@@ -501,7 +514,10 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
if (WARN_ON_ONCE(!svm->nested.initialized))
return -EINVAL;
if (!nested_vmcb_checks(svm, vmcb12)) {
load_nested_vmcb_control(svm, &vmcb12->control);
if (!nested_vmcb_check_save(svm, vmcb12) ||
!nested_vmcb_check_controls(&svm->nested.ctl)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
vmcb12->control.exit_info_1 = 0;
......@@ -1205,6 +1221,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
*/
if (!(save->cr0 & X86_CR0_PG))
goto out_free;
if (!(save->efer & EFER_SVME))
goto out_free;
/*
* All checks done, we can enter guest mode. L1 control fields
......
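Condensing the nested.c hunks above into one sketch of the reordered VMRUN path (nothing beyond what the patch already shows; error handling elided):

load_nested_vmcb_control(svm, &vmcb12->control);	/* copy L1's controls first */
if (!nested_vmcb_check_save(svm, vmcb12) ||		/* save-area checks (still pre-copy, see FIXME) */
    !nested_vmcb_check_controls(&svm->nested.ctl)) {	/* plus checks on the copied controls */
	vmcb12->control.exit_code = SVM_EXIT_ERR;	/* report the failed VMRUN to L1 */
	/* ... */
}
/* ... later, in nested_prepare_vmcb_save(): */
svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME);	/* re-assert SVME to close the check/use gap */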
......@@ -1505,35 +1505,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
{
struct kvm_x86_msr_filter *msr_filter;
struct msr_bitmap_range *ranges;
struct kvm *kvm = vcpu->kvm;
struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
u32 count = kvm->arch.msr_filter.count;
u32 i;
bool r = kvm->arch.msr_filter.default_allow;
bool allowed;
int idx;
u32 i;
/* MSR filtering not set up or x2APIC enabled, allow everything */
if (!count || (index >= 0x800 && index <= 0x8ff))
/* x2APIC MSRs do not support filtering. */
if (index >= 0x800 && index <= 0x8ff)
return true;
/* Prevent collision with set_msr_filter */
idx = srcu_read_lock(&kvm->srcu);
for (i = 0; i < count; i++) {
msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
if (!msr_filter) {
allowed = true;
goto out;
}
allowed = msr_filter->default_allow;
ranges = msr_filter->ranges;
for (i = 0; i < msr_filter->count; i++) {
u32 start = ranges[i].base;
u32 end = start + ranges[i].nmsrs;
u32 flags = ranges[i].flags;
unsigned long *bitmap = ranges[i].bitmap;
if ((index >= start) && (index < end) && (flags & type)) {
r = !!test_bit(index - start, bitmap);
allowed = !!test_bit(index - start, bitmap);
break;
}
}
out:
srcu_read_unlock(&kvm->srcu, idx);
return r;
return allowed;
}
EXPORT_SYMBOL_GPL(kvm_msr_allowed);
......@@ -5291,25 +5300,34 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
return r;
}
static void kvm_clear_msr_filter(struct kvm *kvm)
static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
{
struct kvm_x86_msr_filter *msr_filter;
msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
if (!msr_filter)
return NULL;
msr_filter->default_allow = default_allow;
return msr_filter;
}
static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
{
u32 i;
u32 count = kvm->arch.msr_filter.count;
struct msr_bitmap_range ranges[16];
mutex_lock(&kvm->lock);
kvm->arch.msr_filter.count = 0;
memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
mutex_unlock(&kvm->lock);
synchronize_srcu(&kvm->srcu);
if (!msr_filter)
return;
for (i = 0; i < msr_filter->count; i++)
kfree(msr_filter->ranges[i].bitmap);
for (i = 0; i < count; i++)
kfree(ranges[i].bitmap);
kfree(msr_filter);
}
static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
struct kvm_msr_filter_range *user_range)
{
struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
struct msr_bitmap_range range;
unsigned long *bitmap = NULL;
size_t bitmap_size;
......@@ -5343,11 +5361,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user
goto err;
}
/* Everything ok, add this range identifier to our global pool */
ranges[kvm->arch.msr_filter.count] = range;
/* Make sure we filled the array before we tell anyone to walk it */
smp_wmb();
kvm->arch.msr_filter.count++;
/* Everything ok, add this range identifier. */
msr_filter->ranges[msr_filter->count] = range;
msr_filter->count++;
return 0;
err:
......@@ -5358,10 +5374,11 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_msr_filter __user *user_msr_filter = argp;
struct kvm_x86_msr_filter *new_filter, *old_filter;
struct kvm_msr_filter filter;
bool default_allow;
int r = 0;
bool empty = true;
int r = 0;
u32 i;
if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
......@@ -5374,25 +5391,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
if (empty && !default_allow)
return -EINVAL;
kvm_clear_msr_filter(kvm);
kvm->arch.msr_filter.default_allow = default_allow;
new_filter = kvm_alloc_msr_filter(default_allow);
if (!new_filter)
return -ENOMEM;
/*
* Protect from concurrent calls to this function that could trigger
* a TOCTOU violation on kvm->arch.msr_filter.count.
*/
mutex_lock(&kvm->lock);
for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
if (r)
break;
r = kvm_add_msr_filter(new_filter, &filter.ranges[i]);
if (r) {
kvm_free_msr_filter(new_filter);
return r;
}
}
mutex_lock(&kvm->lock);
/* The per-VM filter is protected by kvm->lock... */
old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
synchronize_srcu(&kvm->srcu);
kvm_free_msr_filter(old_filter);
kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
mutex_unlock(&kvm->lock);
return r;
return 0;
}
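The lifetime rules behind this rewrite are the standard SRCU publish-and-retire pattern; a condensed sketch of how the two sides shown above fit together (names as in the hunks, no new API assumed):

/* read side (kvm_msr_allowed): pin a grace period, then use the pointer */
idx = srcu_read_lock(&kvm->srcu);
msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
if (msr_filter) {
	/* ... walk msr_filter->ranges[0..count) ... */
}
srcu_read_unlock(&kvm->srcu, idx);

/* update side (set_msr_filter ioctl, under kvm->lock): publish, wait, free */
old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
synchronize_srcu(&kvm->srcu);		/* every reader that saw old_filter is done */
kvm_free_msr_filter(old_filter);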
long kvm_arch_vm_ioctl(struct file *filp,
......@@ -10423,8 +10447,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
u32 i;
if (current->mm == kvm->mm) {
/*
* Free memory regions allocated on behalf of userspace,
......@@ -10441,8 +10463,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
if (kvm_x86_ops.vm_destroy)
kvm_x86_ops.vm_destroy(kvm);
for (i = 0; i < kvm->arch.msr_filter.count; i++)
kfree(kvm->arch.msr_filter.ranges[i].bitmap);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
......
......@@ -231,7 +231,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
if (pgprot_val(old_prot) == pgprot_val(new_prot))
return;
pa = pfn << page_level_shift(level);
pa = pfn << PAGE_SHIFT;
size = page_level_size(level);
/*
......
......@@ -1735,7 +1735,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
* add rsp, 8 // skip eth_type_trans's frame
* ret // return to its caller
*/
int arch_prepare_bpf_trampoline(void *image, void *image_end,
int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_progs *tprogs,
void *orig_call)
......@@ -1774,6 +1774,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
save_regs(m, &prog, nr_args, stack_size);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
if (emit_call(&prog, __bpf_tramp_enter, prog)) {
ret = -EINVAL;
goto cleanup;
}
}
if (fentry->nr_progs)
if (invoke_bpf(m, &prog, fentry, stack_size))
return -EINVAL;
......@@ -1792,8 +1801,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
if (fentry->nr_progs || fmod_ret->nr_progs)
restore_regs(m, &prog, nr_args, stack_size);
restore_regs(m, &prog, nr_args, stack_size);
/* call original function */
if (emit_call(&prog, orig_call, prog)) {
......@@ -1802,6 +1810,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
}
/* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
im->ip_after_call = prog;
memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
}
if (fmod_ret->nr_progs) {
......@@ -1832,9 +1843,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
* the return value is only updated on the stack and still needs to be
* restored to R0.
*/
if (flags & BPF_TRAMP_F_CALL_ORIG)
if (flags & BPF_TRAMP_F_CALL_ORIG) {
im->ip_epilogue = prog;
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
if (emit_call(&prog, __bpf_tramp_exit, prog)) {
ret = -EINVAL;
goto cleanup;
}
/* restore original return value back into RAX */
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
}
EMIT1(0x5B); /* pop rbx */
EMIT1(0xC9); /* leave */
......
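Read end to end, the trampoline emitted for the BPF_TRAMP_F_CALL_ORIG case now looks roughly like this (C-style pseudocode of the generated sequence, condensed from the hunks above):

/*
 * save_regs(...)				spill the traced function's arguments
 * rdi = im; call __bpf_tramp_enter		new: enter hook, paired with the exit hook below
 * run fentry programs
 * restore_regs(...); call orig_call		now done unconditionally before the original call
 * store rax to the stack			so fmod_ret/fexit programs can read it
 * im->ip_after_call = here; emit 5-byte atomic nop
 * run fmod_ret / fexit programs
 * im->ip_epilogue = here
 * rdi = im; call __bpf_tramp_exit		new: exit hook
 * rax = saved return value; pop rbx; leave; ret
 */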
......@@ -98,8 +98,8 @@ EXPORT_SYMBOL_GPL(xen_p2m_size);
unsigned long xen_max_p2m_pfn __read_mostly;
EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
#ifdef CONFIG_XEN_MEMORY_HOTPLUG_LIMIT
#define P2M_LIMIT CONFIG_XEN_MEMORY_HOTPLUG_LIMIT
#else
#define P2M_LIMIT 0
#endif
......@@ -416,9 +416,6 @@ void __init xen_vmalloc_p2m_tree(void)
xen_p2m_last_pfn = xen_max_p2m_pfn;
p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
if (!p2m_limit && IS_ENABLED(CONFIG_XEN_UNPOPULATED_ALLOC))
p2m_limit = xen_start_info->nr_pages * XEN_EXTRA_MEM_RATIO;
vm.flags = VM_ALLOC;
vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
PMD_SIZE * PMDS_PER_MID_PAGE);
......
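For a sense of scale (worked example, not part of the change): CONFIG_XEN_MEMORY_HOTPLUG_LIMIT is given in GiB, so with 4 KiB pages a hypothetical limit of 512 GiB yields

p2m_limit = 512 * 1024 * 1024 * 1024 / 4096 = 134217728 page frames,

and the vmalloc'd p2m area is then sized for max(xen_max_p2m_pfn, p2m_limit) entries of sizeof(unsigned long), rounded up to PMD_SIZE * PMDS_PER_MID_PAGE.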
......@@ -59,6 +59,18 @@ static struct {
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
/*
* The maximum amount of extra memory compared to the base size. The
* main scaling factor is the size of struct page. At extreme ratios
* of base:extra, all the base memory can be filled with page
* structures for the extra memory, leaving no space for anything
* else.
*
* 10x seems like a reasonable balance between scaling flexibility and
* leaving a practically usable system.
*/
#define EXTRA_MEM_RATIO (10)
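A quick worked check of that ratio (assuming 4 KiB pages and roughly 64 bytes per struct page, both figures for illustration only): each page of extra memory costs about 64 bytes of base memory for its struct page, so

overhead in base memory ~= extra * 64 / 4096 = (10 * base) * 64 / 4096 ~= 0.16 * base

i.e. about 16% of the base is spent on page structures at 10x, while a ratio near 4096 / 64 = 64 would consume the base entirely — the failure mode the comment describes.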
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
static void __init xen_parse_512gb(void)
......@@ -778,13 +790,13 @@ char * __init xen_memory_setup(void)
extra_pages += max_pages - max_pfn;
/*
* Clamp the amount of extra memory to a XEN_EXTRA_MEM_RATIO
* Clamp the amount of extra memory to an EXTRA_MEM_RATIO
* factor of the base size.
*
* Make sure we have no memory above max_pages, as this area
* isn't handled by the p2m management.
*/
extra_pages = min3(XEN_EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages, max_pages - max_pfn);
i = 0;
addr = xen_e820_table.entries[0].addr;
......
......@@ -99,37 +99,6 @@
LOAD_CP_REGS_TAB(6)
LOAD_CP_REGS_TAB(7)
/*
* coprocessor_flush(struct thread_info*, index)
* a2 a3
*
* Save coprocessor registers for coprocessor 'index'.
* The register values are saved to or loaded from the coprocessor area