Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API for creating a custom Linux kernel program #463

Open
brenns10 opened this issue Jan 22, 2025 · 2 comments
Open

API for creating a custom Linux kernel program #463

brenns10 opened this issue Jan 22, 2025 · 2 comments

Comments

@brenns10
Copy link
Contributor

brenns10 commented Jan 22, 2025

Currently, it's not documented anywhere, but the only way to create a Program that has the IS_LINUX_KERNEL flag set is to go through drgn_program_set_core_dump_fd_internal().

drgn/libdrgn/program.c

Lines 327 to 691 in 64d82dd

/*
 * Set up @prog to debug the core dump that is open on @fd.
 *
 * @fd is stored in prog->core_fd. On any failure, the file descriptor is
 * closed, prog->core_fd is reset to -1, and any state set up here (ELF handle,
 * platform flag, memory segments, parsed notes) is rolled back.
 *
 * The file is handled in one of the following ways:
 *
 * - If has_kdump_signature() reports a kdump file, or the file is an ELF
 *   vmcore (has VMCOREINFO and is not /proc/kcore) and the
 *   DRGN_USE_LIBKDUMPFILE_FOR_ELF environment variable parses to a nonzero
 *   integer, setup is delegated to drgn_program_set_kdump().
 * - Otherwise the file must be an ELF core file (ET_CORE). Its PT_LOAD
 *   segments are registered as memory segments, and its PT_NOTE segments are
 *   scanned to decide whether this is /proc/kcore, a Linux kernel vmcore, or
 *   a userspace core dump. For the two kernel cases,
 *   DRGN_PROGRAM_IS_LINUX_KERNEL is set and drgn_program_finish_set_kernel()
 *   is called.
 *
 * @param prog Program to configure.
 * @param fd Open file descriptor for the core dump.
 * @param path Path of the core dump; passed to has_kdump_signature() and used
 * in the fstatfs() error message.
 * @return NULL on success, non-NULL on error.
 */
static struct drgn_error *
drgn_program_set_core_dump_fd_internal(struct drgn_program *prog, int fd,
				       const char *path)
{
	struct drgn_error *err;
	GElf_Ehdr ehdr_mem, *ehdr;
	bool had_platform;
	bool is_64_bit, little_endian, is_kdump;
	size_t phnum, i;
	size_t num_file_segments, j;
	bool have_phys_addrs = false;
	bool have_qemu_note = false;
	const char *vmcoreinfo_note = NULL;
	size_t vmcoreinfo_size = 0;
	bool have_nt_taskstruct = false, is_proc_kcore;
	/*
	 * The caller may have already provided vmcoreinfo. Remember that so the
	 * error path only frees a buffer that was allocated by this function.
	 */
	bool have_vmcoreinfo = prog->vmcoreinfo.raw;
	bool had_vmcoreinfo = have_vmcoreinfo;

	prog->core_fd = fd;

	err = has_kdump_signature(prog, path, &is_kdump);
	if (err)
		goto out_fd;
	if (is_kdump) {
		err = drgn_program_set_kdump(prog);
		if (err)
			goto out_fd;
		return NULL;
	}

	elf_version(EV_CURRENT);
	prog->core = elf_begin(prog->core_fd, ELF_C_READ, NULL);
	if (!prog->core) {
		err = drgn_error_libelf();
		goto out_fd;
	}

	ehdr = gelf_getehdr(prog->core, &ehdr_mem);
	if (!ehdr || ehdr->e_type != ET_CORE) {
		err = drgn_error_format(DRGN_ERROR_INVALID_ARGUMENT,
					"not an ELF core file");
		goto out_elf;
	}

	/* Only derive the platform from the ELF header if none was set. */
	had_platform = prog->has_platform;
	if (!had_platform) {
		struct drgn_platform platform;
		drgn_platform_from_elf(ehdr, &platform);
		drgn_program_set_platform(prog, &platform);
	}

	is_64_bit = ehdr->e_ident[EI_CLASS] == ELFCLASS64;
	little_endian = ehdr->e_ident[EI_DATA] == ELFDATA2LSB;

	if (elf_getphdrnum(prog->core, &phnum) != 0) {
		err = drgn_error_libelf();
		goto out_platform;
	}

	/*
	 * First pass: count the number of loadable segments, check if p_paddr
	 * is valid, and check for notes.
	 */
	num_file_segments = 0;
	for (i = 0; i < phnum; i++) {
		GElf_Phdr phdr_mem, *phdr;

		phdr = gelf_getphdr(prog->core, i, &phdr_mem);
		if (!phdr) {
			err = drgn_error_libelf();
			goto out_notes;
		}

		if (phdr->p_type == PT_LOAD) {
			if (phdr->p_paddr)
				have_phys_addrs = true;
			num_file_segments++;
		} else if (phdr->p_type == PT_NOTE) {
			Elf_Data *data;
			size_t offset;
			GElf_Nhdr nhdr;
			size_t name_offset, desc_offset;

			data = elf_getdata_rawchunk(prog->core, phdr->p_offset,
						    phdr->p_filesz,
						    note_header_type(phdr->p_align));
			if (!data) {
				err = drgn_error_libelf();
				goto out_notes;
			}

			offset = 0;
			while (offset < data->d_size &&
			       (offset = gelf_getnote(data, offset, &nhdr,
						      &name_offset,
						      &desc_offset))) {
				const char *name, *desc;

				name = (char *)data->d_buf + name_offset;
				desc = (char *)data->d_buf + desc_offset;
				/*
				 * n_namesz is compared against sizeof(), so the
				 * NUL terminator is included in each name
				 * comparison below.
				 */
				if (nhdr.n_namesz == sizeof("CORE") &&
				    memcmp(name, "CORE", sizeof("CORE")) == 0) {
					if (nhdr.n_type == NT_TASKSTRUCT)
						have_nt_taskstruct = true;
				} else if (nhdr.n_namesz == sizeof("LINUX") &&
					   memcmp(name, "LINUX",
						  sizeof("LINUX")) == 0) {
					if (nhdr.n_type == NT_ARM_PAC_MASK &&
					    nhdr.n_descsz >=
					    2 * sizeof(uint64_t)) {
						/*
						 * The mask that is saved is the
						 * second 64-bit word of the
						 * note descriptor.
						 */
						memcpy(&prog->aarch64_insn_pac_mask,
						       desc + sizeof(uint64_t),
						       sizeof(uint64_t));
						/*
						 * bswap_64() returns the
						 * swapped value rather than
						 * swapping in place, so the
						 * result must be stored back.
						 */
						if (little_endian !=
						    HOST_LITTLE_ENDIAN)
							prog->aarch64_insn_pac_mask =
								bswap_64(prog->aarch64_insn_pac_mask);
					}
				} else if (nhdr.n_namesz == sizeof("VMCOREINFO") &&
					   memcmp(name, "VMCOREINFO",
						  sizeof("VMCOREINFO")) == 0) {
					vmcoreinfo_note = desc;
					vmcoreinfo_size = nhdr.n_descsz;
					/*
					 * This is either a vmcore or
					 * /proc/kcore, so even a p_paddr of 0
					 * may be valid.
					 */
					have_phys_addrs = true;
					have_vmcoreinfo = true;
				} else if (nhdr.n_namesz == sizeof("QEMU") &&
					   memcmp(name, "QEMU",
						  sizeof("QEMU")) == 0) {
					have_qemu_note = true;
				}
			}
		}
	}

	if (have_nt_taskstruct) {
		/*
		 * If the core file has an NT_TASKSTRUCT note and is in /proc,
		 * then it's probably /proc/kcore.
		 */
		struct statfs fs;

		if (fstatfs(prog->core_fd, &fs) == -1) {
			err = drgn_error_create_os("fstatfs", errno, path);
			/*
			 * NOTE(review): this check looks redundant if
			 * drgn_error_create_os() never returns NULL; if it
			 * could, fs would be read uninitialized below. Confirm
			 * against the helper's contract.
			 */
			if (err)
				goto out_notes;
		}
		is_proc_kcore = fs.f_type == 0x9fa0; /* PROC_SUPER_MAGIC */
	} else {
		is_proc_kcore = false;
	}

	if (have_vmcoreinfo && !is_proc_kcore) {
		char *env;

		/* Use libkdumpfile for ELF vmcores if it was requested. */
		env = getenv("DRGN_USE_LIBKDUMPFILE_FOR_ELF");
		if (env && atoi(env)) {
			err = drgn_program_set_kdump(prog);
			if (err)
				goto out_notes;
			return NULL;
		}
	}

	prog->file_segments = malloc_array(num_file_segments,
					   sizeof(*prog->file_segments));
	if (!prog->file_segments) {
		err = &drgn_enomem;
		goto out_notes;
	}

	bool pgtable_reader =
		(is_proc_kcore || have_vmcoreinfo) &&
		prog->platform.arch->linux_kernel_pgtable_iterator_next;
	if (pgtable_reader) {
		/*
		 * Try to read any memory that isn't in the core dump via the
		 * page table.
		 */
		err = drgn_program_add_memory_segment(prog, 0, UINT64_MAX,
						      read_memory_via_pgtable,
						      prog, false);
		if (err)
			goto out_segments;
	}

	/* Second pass: add the segments. */
	for (i = 0, j = 0; i < phnum && j < num_file_segments; i++) {
		GElf_Phdr phdr_mem, *phdr;

		phdr = gelf_getphdr(prog->core, i, &phdr_mem);
		if (!phdr) {
			err = drgn_error_libelf();
			goto out_segments;
		}

		if (phdr->p_type != PT_LOAD)
			continue;

		prog->file_segments[j].file_offset = phdr->p_offset;
		prog->file_segments[j].file_size = phdr->p_filesz;
		prog->file_segments[j].fd = prog->core_fd;
		prog->file_segments[j].eio_is_fault = false;
		/*
		 * p_filesz < p_memsz is ambiguous for core dumps. The ELF
		 * specification says that "if the segment's memory size p_memsz
		 * is larger than the file size p_filesz, the 'extra' bytes are
		 * defined to hold the value 0 and to follow the segment's
		 * initialized area."
		 *
		 * However, the Linux kernel generates userspace core dumps with
		 * segments with p_filesz < p_memsz to indicate that the range
		 * between p_filesz and p_memsz was filtered out (see
		 * coredump_filter in core(5)). These bytes were not necessarily
		 * zeroes in the process's memory, which contradicts the ELF
		 * specification in a way.
		 *
		 * As of Linux 5.19, /proc/kcore and /proc/vmcore never have
		 * segments with p_filesz < p_memsz. However, makedumpfile
		 * creates segments with p_filesz < p_memsz to indicate ranges
		 * that were excluded. This is similar to Linux userspace core
		 * dumps, except that makedumpfile can also exclude ranges that
		 * were all zeroes.
		 *
		 * So, for userspace core dumps, we want to fault for ranges
		 * between p_filesz and p_memsz to indicate that the memory was
		 * not saved rather than lying and returning zeroes. For
		 * /proc/kcore, we don't expect to see p_filesz < p_memsz but we
		 * fault to be safe. For Linux kernel core dumps, we can't
		 * distinguish between memory that was excluded because it was
		 * all zeroes and memory that was excluded by makedumpfile for
		 * another reason, so we're forced to always return zeroes.
		 */
		prog->file_segments[j].zerofill = have_vmcoreinfo && !is_proc_kcore;
		err = drgn_program_add_memory_segment(prog, phdr->p_vaddr,
						      phdr->p_memsz,
						      drgn_read_memory_file,
						      &prog->file_segments[j],
						      false);
		if (err)
			goto out_segments;
		/*
		 * Also register the segment by physical address, unless p_paddr
		 * is all ones for the ELF class.
		 */
		if (have_phys_addrs &&
		    phdr->p_paddr != (is_64_bit ? UINT64_MAX : UINT32_MAX)) {
			err = drgn_program_add_memory_segment(prog,
							      phdr->p_paddr,
							      phdr->p_memsz,
							      drgn_read_memory_file,
							      &prog->file_segments[j],
							      true);
			if (err)
				goto out_segments;
		}
		j++;
	}

	/*
	 * Before Linux kernel commit 464920104bf7 ("/proc/kcore: update
	 * physical address for kcore ram and text") (in v4.11), p_paddr in
	 * /proc/kcore is always zero. If we know the address of the direct
	 * mapping, we can still add physical segments. This needs to be a third
	 * pass, as we may need to read virtual memory to determine the mapping.
	 */
	if (is_proc_kcore && !have_phys_addrs &&
	    prog->platform.arch->linux_kernel_live_direct_mapping_fallback) {
		uint64_t direct_mapping, direct_mapping_size;

		err = prog->platform.arch->linux_kernel_live_direct_mapping_fallback(prog,
										     &direct_mapping,
										     &direct_mapping_size);
		if (err)
			goto out_segments;

		for (i = 0, j = 0; i < phnum && j < num_file_segments; i++) {
			GElf_Phdr phdr_mem, *phdr;

			phdr = gelf_getphdr(prog->core, i, &phdr_mem);
			if (!phdr) {
				err = drgn_error_libelf();
				goto out_segments;
			}

			if (phdr->p_type != PT_LOAD)
				continue;

			/* Only segments entirely inside the direct mapping. */
			if (phdr->p_vaddr >= direct_mapping &&
			    phdr->p_vaddr - direct_mapping + phdr->p_memsz <=
			    direct_mapping_size) {
				uint64_t phys_addr;

				phys_addr = phdr->p_vaddr - direct_mapping;
				err = drgn_program_add_memory_segment(prog,
								      phys_addr,
								      pgtable_reader ?
								      phdr->p_filesz :
								      phdr->p_memsz,
								      drgn_read_memory_file,
								      &prog->file_segments[j],
								      true);
				if (err)
					goto out_segments;
			}
			j++;
		}
	}

	/* A vmcoreinfo that was already set on the program takes precedence. */
	if (vmcoreinfo_note && !prog->vmcoreinfo.raw) {
		err = drgn_program_parse_vmcoreinfo(prog, vmcoreinfo_note,
						    vmcoreinfo_size);
		if (err)
			goto out_segments;
	}

	if (is_proc_kcore) {
		if (!have_vmcoreinfo) {
			err = read_vmcoreinfo_fallback(prog);
			if (err)
				goto out_segments;
		}
		prog->flags |= (DRGN_PROGRAM_IS_LINUX_KERNEL |
				DRGN_PROGRAM_IS_LIVE |
				DRGN_PROGRAM_IS_LOCAL);
		elf_end(prog->core);
		prog->core = NULL;
	} else if (have_vmcoreinfo) {
		prog->flags |= DRGN_PROGRAM_IS_LINUX_KERNEL;
	} else if (have_qemu_note) {
		err = drgn_error_create(DRGN_ERROR_INVALID_ARGUMENT,
					"unrecognized QEMU memory dump; "
					"for Linux guests, run QEMU with '-device vmcoreinfo', "
					"compile the kernel with CONFIG_FW_CFG_SYSFS and CONFIG_KEXEC, "
					"and load the qemu_fw_cfg kernel module "
					"before dumping the guest memory "
					"(requires Linux >= 4.17 and QEMU >= 2.11)");
		goto out_segments;
	}

	if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL) {
		err = drgn_program_finish_set_kernel(prog);
		if (err)
			goto out_segments;
	}
	return NULL;

	/* Error unwinding: each label undoes one stage and falls through. */
out_segments:
	drgn_memory_reader_deinit(&prog->reader);
	drgn_memory_reader_init(&prog->reader);
	free(prog->file_segments);
	prog->file_segments = NULL;
out_notes:
	// Reset anything we parsed from ELF notes.
	prog->aarch64_insn_pac_mask = 0;
	// Free vmcoreinfo buffer if it was not provided by the caller
	if (!had_vmcoreinfo) {
		free(prog->vmcoreinfo.raw);
		memset(&prog->vmcoreinfo, 0, sizeof(prog->vmcoreinfo));
	}
out_platform:
	prog->has_platform = had_platform;
out_elf:
	elf_end(prog->core);
	prog->core = NULL;
out_fd:
	close(prog->core_fd);
	prog->core_fd = -1;
	return err;
}

This means that it's not possible to create a Program that represents the kernel with a custom memory reader. This was the goal of #246, which enabled a really interesting use case, even if setting program flags may not be the preferred way to do so.

It seems to me that as of now, it wouldn't be terribly hard to support this. As far as I can tell, the following things would be necessary:

  1. The VMCOREINFO would need to have been provided at the creation of the program, along with the platform.
  2. A Linux kernel object finder should be created and added to the Program.
  3. A page table memory reader might need to be created and added to the Program. (Though it could be unnecessary)
  4. The IS_LINUX_KERNEL flag should be set.

With that, I believe that the next time Program.loaded_modules() is called, the kernel module iterator would be activated, and assuming the memory readers work, drgn should be able to proceed as normal.

I was thinking one interesting way to achieve this could be a Program.linux_kernel_main_module() function which does the above and then returns the kernel main module. However, I'm not sure that it's the right API, so I feel like that's an area to discuss.

@brenns10
Copy link
Contributor Author

Notes from our discussion in the January meeting:

  • This dovetails with the gdbstub support. We'll need a good way to initialize Linux kernel programs for gdbstub targets.
  • Another use case @osandov had was a script for creating a struct pt_regs from an oops message and doing some unwinding. It would have been nice to create a program, add a vmlinux, and then start using it as a kernel program without memory readers.
  • The complexity of Program initialization makes it a bit difficult to support these things. Currently a new, blank Program is technically a "userspace core dump" according to the program flags. Really, it should probably be created in some uninitialized state, but that would introduce some backward incompatibility.
  • We discussed adding an is_linux_kernel flag to the Program constructor which could set the corresponding flag, object finder, and page table memory reader.
  • We could also just reuse the vmcoreinfo argument in the program constructor as a way to signal that we should set the IS_LINUX_KERNEL flag. The downside is that, if you don't have a vmcoreinfo, you would have to forge it or pass an empty string. (We currently require that vmcoreinfo at least contain an osrelease and page size, as well as swapper_pg_dir. We may want to reduce that restriction.)
  • If we did choose to implement it in the constructor (either of the above ways), we would need to ensure that drgn_program_set_core_dump_fd_internal() doesn't duplicate any of the work we did by re-adding the object finder & memory reader.
  • If we did choose to implement it in the constructor, then we wouldn't really be solving the problem for the upcoming gdbstub interface, since ideally gdbstub could detect the vmcoreinfo and set the program as a linux kernel program.

@osandov
Copy link
Owner

osandov commented Jan 22, 2025

Thanks for the summary!

If we did choose to implement it in the constructor, then we wouldn't really be solving the problem for the upcoming gdbstub interface, since ideally gdbstub could detect the vmcoreinfo and set the program as a linux kernel program.

To clarify this point, I was more concerned about gdbstub implementations (and things like it) that don't yet have the ability to report the vmcoreinfo. In those cases, the only way we know that we're debugging the kernel is if the user tells us (e.g., by passing a vmlinux file to -s). So the program lifetime would look something like:

  • Program is created as uninitialized.
  • Program is set to gdbstub target (or set up with some other custom memory reader), but we don't know what it is.
  • vmlinux file is provided to Program, at which point we can set up all of the Linux kernel stuff.

So we probably still need another interface, whether it's Program.linux_kernel_main_module() or something else.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants