From dd057d6856ece8be4e6358b9f46c6c110b70a218 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 01/10] libdrgn: shrink symbol binding & kind enums By using __attribute__((__packed__)), we shrink each enum from the default integer size of four bytes, down to the minimum size of one. This reduces the size of drgn_symbol from 32 bytes down to 26, with 6 bytes of padding. It doesn't have a practical benefit yet, but adding fields to struct drgn_symbol in the future may not increase the size. Signed-off-by: Stephen Brennan --- libdrgn/drgn.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdrgn/drgn.h b/libdrgn/drgn.h index f223acc62..8d372ef7e 100644 --- a/libdrgn/drgn.h +++ b/libdrgn/drgn.h @@ -2882,7 +2882,7 @@ enum drgn_symbol_binding { DRGN_SYMBOL_BINDING_GLOBAL, DRGN_SYMBOL_BINDING_WEAK, DRGN_SYMBOL_BINDING_UNIQUE = 11, /* STB_GNU_UNIQUE + 1 */ -}; +} __attribute__((__packed__)); /** Kind of entity represented by a symbol. */ enum drgn_symbol_kind { @@ -2898,7 +2898,7 @@ enum drgn_symbol_kind { DRGN_SYMBOL_KIND_COMMON, DRGN_SYMBOL_KIND_TLS, DRGN_SYMBOL_KIND_IFUNC = 10, /* STT_GNU_IFUNC */ -}; +} __attribute__((__packed__)); /** Destroy a @ref drgn_symbol. */ void drgn_symbol_destroy(struct drgn_symbol *sym); From 66044392452016d3669e2cbacf9c9d3eb28c30d6 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 02/10] libdrgn: introduce Symbol Finder API Symbol lookup is not yet modular, like type or object lookup. However, making it modular would enable easier development and prototyping of alternative Symbol providers, such as Linux kernel module symbol tables, vmlinux kallsyms tables, and BPF function symbols. To begin with, create a modular Symbol API within libdrgn, and refactor the ELF symbol search to use it. For now, we leave drgn_program_find_symbol_by_address_internal() alone. 
Its conversion will require some surgery, since the new API can return errors, whereas this function cannot. Signed-off-by: Stephen Brennan --- libdrgn/debug_info.c | 2 + libdrgn/debug_info.h | 2 + libdrgn/drgn.h | 67 ++++++++ libdrgn/program.c | 370 +++++++++++++++++++++++++------------------ libdrgn/program.h | 13 ++ libdrgn/symbol.c | 58 +++++++ libdrgn/symbol.h | 33 ++++ 7 files changed, 394 insertions(+), 151 deletions(-) diff --git a/libdrgn/debug_info.c b/libdrgn/debug_info.c index ff30b55af..0eb75104e 100644 --- a/libdrgn/debug_info.c +++ b/libdrgn/debug_info.c @@ -2044,6 +2044,8 @@ void drgn_debug_info_init(struct drgn_debug_info *dbinfo, drgn_program_add_object_finder_impl(prog, &dbinfo->object_finder, drgn_debug_info_find_object, dbinfo); + drgn_program_add_symbol_finder_impl(prog, &dbinfo->symbol_finder, + elf_symbols_search, prog); drgn_module_table_init(&dbinfo->modules); c_string_set_init(&dbinfo->module_names); drgn_dwarf_info_init(dbinfo); diff --git a/libdrgn/debug_info.h b/libdrgn/debug_info.h index 64e8bb863..0b689106b 100644 --- a/libdrgn/debug_info.h +++ b/libdrgn/debug_info.h @@ -23,6 +23,7 @@ #include "object_index.h" #include "orc_info.h" #include "string_builder.h" +#include "symbol.h" #include "type.h" #include "vector.h" @@ -137,6 +138,7 @@ struct drgn_debug_info { struct drgn_type_finder type_finder; struct drgn_object_finder object_finder; + struct drgn_symbol_finder symbol_finder; /** DWARF frontend library handle. 
*/ Dwfl *dwfl; diff --git a/libdrgn/drgn.h b/libdrgn/drgn.h index 8d372ef7e..b042b5659 100644 --- a/libdrgn/drgn.h +++ b/libdrgn/drgn.h @@ -930,6 +930,73 @@ struct drgn_error *drgn_program_find_symbols_by_address(struct drgn_program *pro struct drgn_symbol ***syms_ret, size_t *count_ret); +/** Flags for @ref drgn_symbol_find_fn() */ +enum drgn_find_symbol_flags { + /** Find symbols whose name matches the name argument */ + DRGN_FIND_SYMBOL_NAME = 1 << 0, + /** Find symbols whose address matches the addr argument */ + DRGN_FIND_SYMBOL_ADDR = 1 << 1, + /** Find only one symbol */ + DRGN_FIND_SYMBOL_ONE = 1 << 2, +}; + +/** Result builder for @ref drgn_symbol_find_fn() */ +struct drgn_symbol_result_builder; + +/** + * Add or set the return value for a symbol search + * + * Symbol finders should call this with each symbol search result. If the symbol + * search was @ref DRGN_FIND_SYMBOL_ONE, then only the most recent symbol added + * to the builder will be returned. Otherwise, all symbols added to the builder + * are returned. Returns true on success, or false on an allocation failure. + */ +bool +drgn_symbol_result_builder_add(struct drgn_symbol_result_builder *builder, + struct drgn_symbol *symbol); + +/** Get the current number of results in a symbol search result. */ +size_t drgn_symbol_result_builder_count(const struct drgn_symbol_result_builder *builder); + +/** + * Callback for finding one or more symbols. + * + * The callback should perform a symbol lookup based on the flags given in @a + * flags. When multiple flags are provided, the effect should be treated as a + * logical AND. Symbol results should be added to the result builder @a builder, + * via @ref drgn_symbol_result_builder_add(). When @ref DRGN_FIND_SYMBOL_ONE is + * set, then the finding function should only return the single best symbol + * result, and short-circuit return. + * + * When no symbol is found, simply do not add any result to the builder. No + * error should be returned in this case. 
+ * + * @param[in] name Name of the symbol to match + * @param[in] addr Address of the symbol to match + * @param[in] flags Flags indicating the desired behavior of the search + * @param[in] arg Argument passed to @ref drgn_program_add_symbol_finder(). + * @param[in] builder Used to build the resulting symbol output + */ +typedef struct drgn_error * +(*drgn_symbol_find_fn)(const char *name, uint64_t addr, + enum drgn_find_symbol_flags flags, void *arg, + struct drgn_symbol_result_builder *builder); + +/** + * Register a symbol finding callback. + * + * Callbacks are called in the reverse of the order in which they were added. + * When searching for multiple symbols, the results of all callbacks are + * concatenated. When searching for a single symbol, the first callback that + * finds a symbol short-circuits the search. + * + * @param[in] fn Symbol search function + * @param[in] arg Argument to pass to the callback + */ +struct drgn_error * +drgn_program_add_symbol_finder(struct drgn_program *prog, + drgn_symbol_find_fn fn, void *arg); + /** Element type and size. */ struct drgn_element_info { /** Type of the element. 
*/ diff --git a/libdrgn/program.c b/libdrgn/program.c index db2ff07d7..8224fd2da 100644 --- a/libdrgn/program.c +++ b/libdrgn/program.c @@ -112,6 +112,17 @@ void drgn_program_init(struct drgn_program *prog, drgn_object_init(&prog->vmemmap, prog); } +static void drgn_program_deinit_symbol_finders(struct drgn_program *prog) +{ + struct drgn_symbol_finder *finder = prog->symbol_finders; + while (finder) { + struct drgn_symbol_finder *next = finder->next; + if (finder->free) + free(finder); + finder = next; + } +} + void drgn_program_deinit(struct drgn_program *prog) { if (prog->core_dump_notes_cached) { @@ -135,6 +146,7 @@ void drgn_program_deinit(struct drgn_program *prog) drgn_object_deinit(&prog->vmemmap); drgn_object_index_deinit(&prog->oindex); + drgn_program_deinit_symbol_finders(prog); drgn_program_deinit_types(prog); drgn_memory_reader_deinit(&prog->reader); @@ -1798,57 +1810,83 @@ struct drgn_error *drgn_error_symbol_not_found(uint64_t address) address); } -LIBDRGN_PUBLIC struct drgn_error * -drgn_program_find_symbol_by_address(struct drgn_program *prog, uint64_t address, - struct drgn_symbol **ret) -{ - struct drgn_symbol *sym; - - sym = malloc(sizeof(*sym)); - if (!sym) - return &drgn_enomem; - if (!drgn_program_find_symbol_by_address_internal(prog, address, NULL, - sym)) { - free(sym); - return drgn_error_symbol_not_found(address); - } - *ret = sym; - return NULL; -} - -DEFINE_VECTOR(symbolp_vector, struct drgn_symbol *); - -enum { - SYMBOLS_SEARCH_NAME = (1 << 0), - SYMBOLS_SEARCH_ADDRESS = (1 << 1), - SYMBOLS_SEARCH_ALL = (1 << 2), -}; - -struct symbols_search_arg { +struct elf_symbols_search_arg { const char *name; uint64_t address; - struct symbolp_vector results; - unsigned int flags; + enum drgn_find_symbol_flags flags; + struct drgn_error *err; + struct drgn_symbol_result_builder *builder; }; -static bool symbol_match(struct symbols_search_arg *arg, GElf_Addr addr, +static bool elf_symbol_match(struct elf_symbols_search_arg *arg, GElf_Addr addr, 
const GElf_Sym *sym, const char *name) { - if (arg->flags & SYMBOLS_SEARCH_ALL) - return true; - if ((arg->flags & SYMBOLS_SEARCH_NAME) && strcmp(name, arg->name) == 0) - return true; - if ((arg->flags & SYMBOLS_SEARCH_ADDRESS) && - arg->address >= addr && arg->address < addr + sym->st_size) - return true; - return false; + if ((arg->flags & DRGN_FIND_SYMBOL_NAME) && strcmp(name, arg->name) != 0) + return false; + if ((arg->flags & DRGN_FIND_SYMBOL_ADDR) && + (arg->address < addr || arg->address >= addr + sym->st_size)) + return false; + return true; } -static int symbols_search_cb(Dwfl_Module *dwfl_module, void **userdatap, +static bool elf_symbol_store_match(struct elf_symbols_search_arg *arg, + GElf_Sym *elf_sym, GElf_Addr addr, + const char *name) +{ + struct drgn_symbol *sym; + if (arg->flags == (DRGN_FIND_SYMBOL_ONE | DRGN_FIND_SYMBOL_NAME)) { + int binding = GELF_ST_BIND(elf_sym->st_info); + /* + * The order of precedence is + * GLOBAL = UNIQUE > WEAK > LOCAL = everything else + * + * If we found a global or unique symbol, return it + * immediately. If we found a weak symbol, then save it, + * which may overwrite a previously found weak or local + * symbol. Otherwise, save the symbol only if we haven't + * found another symbol. 
+ */ + if (binding != STB_GLOBAL + && binding != STB_GNU_UNIQUE + && binding != STB_WEAK + && drgn_symbol_result_builder_count(arg->builder) > 0) + return false; + sym = malloc(sizeof(*sym)); + if (!sym) { + arg->err = &drgn_enomem; + return true; + } + drgn_symbol_from_elf(name, addr, elf_sym, sym); + if (!drgn_symbol_result_builder_add(arg->builder, sym)) { + arg->err = &drgn_enomem; + drgn_symbol_destroy(sym); + } + + /* Abort on error, or short-circuit if we found a global or + * unique symbol */ + return (arg->err || sym->binding == DRGN_SYMBOL_BINDING_GLOBAL + || sym->binding == DRGN_SYMBOL_BINDING_UNIQUE); + } else { + sym = malloc(sizeof(*sym)); + if (!sym) { + arg->err = &drgn_enomem; + return true; + } + drgn_symbol_from_elf(name, addr, elf_sym, sym); + if (!drgn_symbol_result_builder_add(arg->builder, sym)) { + arg->err = &drgn_enomem; + drgn_symbol_destroy(sym); + } + /* Abort on error, or short-circuit for single lookup */ + return (arg->err || (arg->flags & DRGN_FIND_SYMBOL_ONE)); + } +} + +static int elf_symbols_search_cb(Dwfl_Module *dwfl_module, void **userdatap, const char *module_name, Dwarf_Addr base, void *cb_arg) { - struct symbols_search_arg *arg = cb_arg; + struct elf_symbols_search_arg *arg = cb_arg; int symtab_len = dwfl_module_getsymtab(dwfl_module); if (symtab_len == -1) @@ -1861,54 +1899,103 @@ static int symbols_search_cb(Dwfl_Module *dwfl_module, void **userdatap, const char *name = dwfl_module_getsym_info(dwfl_module, i, &elf_sym, &elf_addr, NULL, NULL, NULL); - if (!name || !symbol_match(arg, elf_addr, &elf_sym, name)) + if (!name || !elf_symbol_match(arg, elf_addr, &elf_sym, name)) continue; + if (elf_symbol_store_match(arg, &elf_sym, elf_addr, name)) + return DWARF_CB_ABORT; + } + return DWARF_CB_OK; +} + +struct drgn_error * +elf_symbols_search(const char *name, uint64_t addr, enum drgn_find_symbol_flags flags, + void *data, struct drgn_symbol_result_builder *builder) +{ + Dwfl_Module *dwfl_module = NULL; + struct drgn_program 
*prog = data; + struct elf_symbols_search_arg arg = { + .name = name, + .address = addr, + .flags = flags, + .err = NULL, + .builder = builder, + }; + if (arg.flags & DRGN_FIND_SYMBOL_ADDR) { + dwfl_module = dwfl_addrmodule(prog->dbinfo.dwfl, arg.address); + if (!dwfl_module) + return NULL; + } + + if ((arg.flags & (DRGN_FIND_SYMBOL_ADDR | DRGN_FIND_SYMBOL_ONE)) + == (DRGN_FIND_SYMBOL_ADDR | DRGN_FIND_SYMBOL_ONE)) { + GElf_Off offset; + GElf_Sym elf_sym; + const char *name = dwfl_module_addrinfo( + dwfl_module, addr, &offset, + &elf_sym, NULL, NULL, NULL); + if (!name) + return NULL; struct drgn_symbol *sym = malloc(sizeof(*sym)); if (!sym) - return DWARF_CB_ABORT; - drgn_symbol_from_elf(name, elf_addr, &elf_sym, sym); - if (!symbolp_vector_append(&arg->results, &sym)) { + return &drgn_enomem; + drgn_symbol_from_elf(name, addr - offset, &elf_sym, sym); + if (!drgn_symbol_result_builder_add(builder, sym)) { + arg.err = &drgn_enomem; drgn_symbol_destroy(sym); - return DWARF_CB_ABORT; } + } else if (dwfl_module) { + elf_symbols_search_cb(dwfl_module, NULL, NULL, 0, &arg); + } else { + dwfl_getmodules(prog->dbinfo.dwfl, elf_symbols_search_cb, &arg, 0); } - return DWARF_CB_OK; + return arg.err; } static struct drgn_error * -symbols_search(struct drgn_program *prog, struct symbols_search_arg *arg, - struct drgn_symbol ***syms_ret, size_t *count_ret) +drgn_program_symbols_search(struct drgn_program *prog, const char *name, + uint64_t addr, enum drgn_find_symbol_flags flags, + struct drgn_symbol_result_builder *builder) { - struct drgn_error *err; - - symbolp_vector_init(&arg->results); - - /* - * When searching for addresses, we can identify the exact module to - * search. Otherwise we need to fall back to an exhaustive search. 
- */ - err = NULL; - if (arg->flags & SYMBOLS_SEARCH_ADDRESS) { - Dwfl_Module *module = dwfl_addrmodule(prog->dbinfo.dwfl, - arg->address); - if (module && symbols_search_cb(module, NULL, NULL, 0, arg)) - err = &drgn_enomem; - } else { - if (dwfl_getmodules(prog->dbinfo.dwfl, symbols_search_cb, arg, - 0)) - err = &drgn_enomem; + struct drgn_error *err = NULL; + struct drgn_symbol_finder *finder = prog->symbol_finders; + while (finder) { + err = finder->fn(name, addr, flags, finder->arg, builder); + if (err || + ((flags & DRGN_FIND_SYMBOL_ONE) + && drgn_symbol_result_builder_count(builder) > 0)) + break; + finder = finder->next; } + return err; +} - if (err) { - vector_for_each(symbolp_vector, symbolp, &arg->results) - drgn_symbol_destroy(*symbolp); - symbolp_vector_deinit(&arg->results); +struct drgn_error * +drgn_program_add_symbol_finder_impl(struct drgn_program *prog, + struct drgn_symbol_finder *finder, + drgn_symbol_find_fn fn, void *arg) +{ + if (finder) { + finder->free = false; } else { - symbolp_vector_shrink_to_fit(&arg->results); - symbolp_vector_steal(&arg->results, syms_ret, count_ret); + finder = malloc(sizeof(*finder)); + if (!finder) + return &drgn_enomem; + finder->free = true; } - return err; + finder->fn = fn; + finder->arg = arg; + finder->next = prog->symbol_finders; + prog->symbol_finders = finder; + return NULL; +} + +LIBDRGN_PUBLIC struct drgn_error * +drgn_program_add_symbol_finder(struct drgn_program *prog, + drgn_symbol_find_fn fn, + void *arg) +{ + return drgn_program_add_symbol_finder_impl(prog, NULL, fn, arg); } LIBDRGN_PUBLIC struct drgn_error * @@ -1916,11 +2003,17 @@ drgn_program_find_symbols_by_name(struct drgn_program *prog, const char *name, struct drgn_symbol ***syms_ret, size_t *count_ret) { - struct symbols_search_arg arg = { - .name = name, - .flags = name ? 
SYMBOLS_SEARCH_NAME : SYMBOLS_SEARCH_ALL, - }; - return symbols_search(prog, &arg, syms_ret, count_ret); + struct drgn_symbol_result_builder builder; + enum drgn_find_symbol_flags flags = name ? DRGN_FIND_SYMBOL_NAME : 0; + + drgn_symbol_result_builder_init(&builder, false); + struct drgn_error *err = drgn_program_symbols_search(prog, name, 0, + flags, &builder); + if (err) + drgn_symbol_result_builder_abort(&builder); + else + drgn_symbol_result_builder_array(&builder, syms_ret, count_ret); + return err; } LIBDRGN_PUBLIC struct drgn_error * @@ -1929,88 +2022,63 @@ drgn_program_find_symbols_by_address(struct drgn_program *prog, struct drgn_symbol ***syms_ret, size_t *count_ret) { - struct symbols_search_arg arg = { - .address = address, - .flags = SYMBOLS_SEARCH_ADDRESS, - }; - return symbols_search(prog, &arg, syms_ret, count_ret); -} + struct drgn_symbol_result_builder builder; + enum drgn_find_symbol_flags flags = DRGN_FIND_SYMBOL_ADDR; -struct find_symbol_by_name_arg { - const char *name; - GElf_Sym sym; - GElf_Addr addr; - bool found; - bool bad_symtabs; -}; + drgn_symbol_result_builder_init(&builder, false); + struct drgn_error *err = drgn_program_symbols_search(prog, NULL, address, + flags, &builder); + if (err) + drgn_symbol_result_builder_abort(&builder); + else + drgn_symbol_result_builder_array(&builder, syms_ret, count_ret); + return err; +} -static int find_symbol_by_name_cb(Dwfl_Module *dwfl_module, void **userdatap, - const char *module_name, Dwarf_Addr base, - void *cb_arg) +LIBDRGN_PUBLIC struct drgn_error * +drgn_program_find_symbol_by_name(struct drgn_program *prog, + const char *name, struct drgn_symbol **ret) { - struct find_symbol_by_name_arg *arg = cb_arg; - int symtab_len = dwfl_module_getsymtab(dwfl_module); - if (symtab_len == -1) { - arg->bad_symtabs = true; - return DWARF_CB_OK; - } - /* - * Global symbols are after local symbols, so by iterating backwards we - * might find a global symbol faster. Ignore the zeroth null symbol. 
- */ - for (int i = symtab_len - 1; i > 0; i--) { - GElf_Sym sym; - GElf_Addr addr; - const char *name = dwfl_module_getsym_info(dwfl_module, i, &sym, - &addr, NULL, NULL, - NULL); - if (name && strcmp(arg->name, name) == 0) { - /* - * The order of precedence is - * GLOBAL = GNU_UNIQUE > WEAK > LOCAL = everything else - * - * If we found a global or unique symbol, return it - * immediately. If we found a weak symbol, then save it, - * which may overwrite a previously found weak or local - * symbol. Otherwise, save the symbol only if we haven't - * found another symbol. - */ - if (GELF_ST_BIND(sym.st_info) == STB_GLOBAL || - GELF_ST_BIND(sym.st_info) == STB_GNU_UNIQUE || - GELF_ST_BIND(sym.st_info) == STB_WEAK || - !arg->found) { - arg->sym = sym; - arg->addr = addr; - arg->found = true; - } - if (GELF_ST_BIND(sym.st_info) == STB_GLOBAL || - GELF_ST_BIND(sym.st_info) == STB_GNU_UNIQUE) - return DWARF_CB_ABORT; - } + struct drgn_symbol_result_builder builder; + enum drgn_find_symbol_flags flags = DRGN_FIND_SYMBOL_NAME | DRGN_FIND_SYMBOL_ONE; + + drgn_symbol_result_builder_init(&builder, true); + struct drgn_error *err = drgn_program_symbols_search(prog, name, 0, + flags, &builder); + if (err) { + drgn_symbol_result_builder_abort(&builder); + return err; } - return DWARF_CB_OK; + + if (!drgn_symbol_result_builder_count(&builder)) + return drgn_error_format(DRGN_ERROR_LOOKUP, + "could not find symbol with name '%s'", name); + + *ret = drgn_symbol_result_builder_single(&builder); + return err; } LIBDRGN_PUBLIC struct drgn_error * -drgn_program_find_symbol_by_name(struct drgn_program *prog, - const char *name, struct drgn_symbol **ret) +drgn_program_find_symbol_by_address(struct drgn_program *prog, uint64_t address, + struct drgn_symbol **ret) { - struct find_symbol_by_name_arg arg = { - .name = name, - }; - dwfl_getmodules(prog->dbinfo.dwfl, find_symbol_by_name_cb, &arg, 0); - if (arg.found) { - struct drgn_symbol *sym = malloc(sizeof(*sym)); - if (!sym) - return 
&drgn_enomem; - drgn_symbol_from_elf(name, arg.addr, &arg.sym, sym); - *ret = sym; - return NULL; + struct drgn_symbol_result_builder builder; + enum drgn_find_symbol_flags flags = DRGN_FIND_SYMBOL_ADDR | DRGN_FIND_SYMBOL_ONE; + + drgn_symbol_result_builder_init(&builder, true); + struct drgn_error *err = drgn_program_symbols_search(prog, NULL, address, + flags, &builder); + + if (err) { + drgn_symbol_result_builder_abort(&builder); + return err; } - return drgn_error_format(DRGN_ERROR_LOOKUP, - "could not find symbol with name '%s'%s", name, - arg.bad_symtabs ? - " (could not get some symbol tables)" : ""); + + if (!drgn_symbol_result_builder_count(&builder)) + return drgn_error_symbol_not_found(address); + + *ret = drgn_symbol_result_builder_single(&builder); + return err; } LIBDRGN_PUBLIC struct drgn_error * diff --git a/libdrgn/program.h b/libdrgn/program.h index 8b30e052f..7e997f167 100644 --- a/libdrgn/program.h +++ b/libdrgn/program.h @@ -27,6 +27,7 @@ #include "object_index.h" #include "platform.h" #include "pp.h" +#include "symbol.h" #include "type.h" #include "vector.h" @@ -109,6 +110,7 @@ struct drgn_program { */ struct drgn_object_index oindex; struct drgn_debug_info dbinfo; + struct drgn_symbol_finder *symbol_finders; /* * Program information. @@ -364,6 +366,17 @@ bool drgn_program_find_symbol_by_address_internal(struct drgn_program *prog, Dwfl_Module *module, struct drgn_symbol *ret); +/* + * Implementation of the Symbol finder API, based on ELF symbols + */ +struct drgn_error * +elf_symbols_search(const char *name, uint64_t addr, enum drgn_find_symbol_flags flags, + void *data, struct drgn_symbol_result_builder *builder); + +struct drgn_error * +drgn_program_add_symbol_finder_impl(struct drgn_program *prog, + struct drgn_symbol_finder *finder, + drgn_symbol_find_fn fn, void *arg); /** * Call before a blocking (I/O or long-running) operation. 
* diff --git a/libdrgn/symbol.c b/libdrgn/symbol.c index 89c92532b..e6097b442 100644 --- a/libdrgn/symbol.c +++ b/libdrgn/symbol.c @@ -73,3 +73,61 @@ LIBDRGN_PUBLIC bool drgn_symbol_eq(struct drgn_symbol *a, struct drgn_symbol *b) a->size == b->size && a->binding == b->binding && a->kind == b->kind); } + +DEFINE_VECTOR_FUNCTIONS(symbolp_vector); + +LIBDRGN_PUBLIC bool +drgn_symbol_result_builder_add(struct drgn_symbol_result_builder *builder, + struct drgn_symbol *symbol) +{ + if (builder->one) { + if (builder->single) + drgn_symbol_destroy(builder->single); + builder->single = symbol; + } else if (!symbolp_vector_append(&builder->vector, &symbol)) { + return false; + } + return true; +} + +LIBDRGN_PUBLIC size_t +drgn_symbol_result_builder_count(const struct drgn_symbol_result_builder *builder) +{ + if (builder->one) + return builder->single ? 1 : 0; + else + return symbolp_vector_size(&builder->vector); +} + +void drgn_symbol_result_builder_init(struct drgn_symbol_result_builder *builder, + bool one) +{ + memset(builder, 0, sizeof(*builder)); + builder->one = one; + if (!one) + symbolp_vector_init(&builder->vector); +} + +void drgn_symbol_result_builder_abort(struct drgn_symbol_result_builder *builder) +{ + if (builder->one) { + drgn_symbol_destroy(builder->single); + } else { + vector_for_each(symbolp_vector, symbolp, &builder->vector) + drgn_symbol_destroy(*symbolp); + symbolp_vector_deinit(&builder->vector); + } +} + +struct drgn_symbol * +drgn_symbol_result_builder_single(struct drgn_symbol_result_builder *builder) +{ + return builder->single; +} + +void drgn_symbol_result_builder_array(struct drgn_symbol_result_builder *builder, + struct drgn_symbol ***syms_ret, size_t *count_ret) +{ + symbolp_vector_shrink_to_fit(&builder->vector); + symbolp_vector_steal(&builder->vector, syms_ret, count_ret); +} diff --git a/libdrgn/symbol.h b/libdrgn/symbol.h index e136e86ee..4241a3ef3 100644 --- a/libdrgn/symbol.h +++ b/libdrgn/symbol.h @@ -7,6 +7,7 @@ #include #include 
"drgn.h" +#include "vector.h" struct drgn_symbol { const char *name; @@ -16,8 +17,40 @@ struct drgn_symbol { enum drgn_symbol_kind kind; }; +struct drgn_symbol_finder { + drgn_symbol_find_fn fn; + void *arg; + struct drgn_symbol_finder *next; + bool free; +}; + +DEFINE_VECTOR_TYPE(symbolp_vector, struct drgn_symbol *); + +struct drgn_symbol_result_builder { + bool one; + union { + struct symbolp_vector vector; + struct drgn_symbol *single; + }; +}; + /** Initialize a @ref drgn_symbol from an ELF symbol. */ void drgn_symbol_from_elf(const char *name, uint64_t address, const GElf_Sym *elf_sym, struct drgn_symbol *ret); +/** Destroy the contents of the result builder */ +void drgn_symbol_result_builder_abort(struct drgn_symbol_result_builder *builder); + +/** Initialize result builder */ +void drgn_symbol_result_builder_init(struct drgn_symbol_result_builder *builder, + bool one); + +/** Return single result */ +struct drgn_symbol * +drgn_symbol_result_builder_single(struct drgn_symbol_result_builder *builder); + +/** Return array result */ +void drgn_symbol_result_builder_array(struct drgn_symbol_result_builder *builder, + struct drgn_symbol ***syms_ret, size_t *count_ret); + #endif /* DRGN_SYMBOL_H */ From 70cd097fe579bf23128b228596791c9cc4389d4f Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 03/10] libdrgn: move find_symbol_by_address_internal The following commit will modify it to use drgn_program_symbols_search(), a static function declared below. Move it underneath in preparation. No changes to the function. 
Signed-off-by: Stephen Brennan --- libdrgn/program.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/libdrgn/program.c b/libdrgn/program.c index 8224fd2da..ce87d2fe2 100644 --- a/libdrgn/program.c +++ b/libdrgn/program.c @@ -1782,27 +1782,6 @@ drgn_program_find_object(struct drgn_program *prog, const char *name, ret); } -bool drgn_program_find_symbol_by_address_internal(struct drgn_program *prog, - uint64_t address, - Dwfl_Module *module, - struct drgn_symbol *ret) -{ - if (!module) { - module = dwfl_addrmodule(prog->dbinfo.dwfl, address); - if (!module) - return false; - } - - GElf_Off offset; - GElf_Sym elf_sym; - const char *name = dwfl_module_addrinfo(module, address, &offset, - &elf_sym, NULL, NULL, NULL); - if (!name) - return false; - drgn_symbol_from_elf(name, address - offset, &elf_sym, ret); - return true; -} - struct drgn_error *drgn_error_symbol_not_found(uint64_t address) { return drgn_error_format(DRGN_ERROR_LOOKUP, @@ -2081,6 +2060,27 @@ drgn_program_find_symbol_by_address(struct drgn_program *prog, uint64_t address, return err; } +bool drgn_program_find_symbol_by_address_internal(struct drgn_program *prog, + uint64_t address, + Dwfl_Module *module, + struct drgn_symbol *ret) +{ + if (!module) { + module = dwfl_addrmodule(prog->dbinfo.dwfl, address); + if (!module) + return false; + } + + GElf_Off offset; + GElf_Sym elf_sym; + const char *name = dwfl_module_addrinfo(module, address, &offset, + &elf_sym, NULL, NULL, NULL); + if (!name) + return false; + drgn_symbol_from_elf(name, address - offset, &elf_sym, ret); + return true; +} + LIBDRGN_PUBLIC struct drgn_error * drgn_program_element_info(struct drgn_program *prog, struct drgn_type *type, struct drgn_element_info *ret) From 944488a21ca2f855ceff7d0ad687501a00e6e35e Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 04/10] libdrgn: Use Symbol Finder API in find_symbol_by_address_internal() 
The drgn_program_find_symbol_by_address_internal() function is used when libdrgn itself may want to look up a symbol: in particular, when formatting stack traces or objects. It does less work by possibly already having a Dwfl_Module looked up, and by avoiding memory allocation of a symbol, and it's more convenient because it doesn't return any errors, including on lookup failure. Unfortunately, the new symbol finder API breaks all of these properties: the returned symbol is now allocated via malloc(), which needs cleanup on error, and errors can be returned by any finder via the lookup API. What's more, the finder API doesn't allow specifying an already-known module. Thankfully, error handling can be improved using the cleanup API, and looking up a module for an address is usually a reasonably cheap binary tree operation. Switch the internal method over to the new finder API. The major difference now is that lookup failures don't result in an error: they simply result in a NULL symbol. 
Signed-off-by: Stephen Brennan --- libdrgn/language_c.c | 22 ++++++++-------- libdrgn/program.c | 33 +++++++++++------------- libdrgn/program.h | 19 +++++++------- libdrgn/stack_trace.c | 60 +++++++++++++++++++++---------------------- libdrgn/symbol.h | 7 +++++ 5 files changed, 70 insertions(+), 71 deletions(-) diff --git a/libdrgn/language_c.c b/libdrgn/language_c.c index bd36b773f..4d0f27455 100644 --- a/libdrgn/language_c.c +++ b/libdrgn/language_c.c @@ -1261,9 +1261,8 @@ c_format_pointer_object(const struct drgn_object *obj, bool c_string = ((flags & DRGN_FORMAT_OBJECT_STRING) && is_character_type(drgn_type_type(underlying_type).type)); - bool have_symbol; uint64_t uvalue; - struct drgn_symbol sym; + _cleanup_symbol_ struct drgn_symbol *sym = NULL; size_t start, type_start, type_end, value_start, value_end; start = sb->len; @@ -1287,18 +1286,17 @@ c_format_pointer_object(const struct drgn_object *obj, if (err) return err; - have_symbol = ((flags & DRGN_FORMAT_OBJECT_SYMBOLIZE) && - drgn_program_find_symbol_by_address_internal(drgn_object_program(obj), - uvalue, - NULL, - &sym)); - if (have_symbol && dereference && !c_string && + if ((flags & DRGN_FORMAT_OBJECT_SYMBOLIZE) && + (err = drgn_program_find_symbol_by_address_internal(drgn_object_program(obj), + uvalue, &sym))) + return err; + if (sym && dereference && !c_string && !string_builder_appendc(sb, '(')) return &drgn_enomem; value_start = sb->len; - if (have_symbol && - !string_builder_appendf(sb, "%s+0x%" PRIx64 " = ", sym.name, - uvalue - sym.address)) + if (sym && + !string_builder_appendf(sb, "%s+0x%" PRIx64 " = ", sym->name, + uvalue - sym->address)) return &drgn_enomem; if (!string_builder_appendf(sb, "0x%" PRIx64, uvalue)) @@ -1307,7 +1305,7 @@ c_format_pointer_object(const struct drgn_object *obj, return NULL; value_end = sb->len; - if ((have_symbol && dereference && !c_string && + if ((sym && dereference && !c_string && !string_builder_appendc(sb, ')')) || !string_builder_append(sb, " = ")) return 
&drgn_enomem; diff --git a/libdrgn/program.c b/libdrgn/program.c index ce87d2fe2..0a09c13d7 100644 --- a/libdrgn/program.c +++ b/libdrgn/program.c @@ -2060,25 +2060,22 @@ drgn_program_find_symbol_by_address(struct drgn_program *prog, uint64_t address, return err; } -bool drgn_program_find_symbol_by_address_internal(struct drgn_program *prog, - uint64_t address, - Dwfl_Module *module, - struct drgn_symbol *ret) -{ - if (!module) { - module = dwfl_addrmodule(prog->dbinfo.dwfl, address); - if (!module) - return false; - } +struct drgn_error * +drgn_program_find_symbol_by_address_internal(struct drgn_program *prog, + uint64_t address, + struct drgn_symbol **ret) +{ + struct drgn_symbol_result_builder builder; + enum drgn_find_symbol_flags flags = DRGN_FIND_SYMBOL_ADDR | DRGN_FIND_SYMBOL_ONE; - GElf_Off offset; - GElf_Sym elf_sym; - const char *name = dwfl_module_addrinfo(module, address, &offset, - &elf_sym, NULL, NULL, NULL); - if (!name) - return false; - drgn_symbol_from_elf(name, address - offset, &elf_sym, ret); - return true; + drgn_symbol_result_builder_init(&builder, true); + struct drgn_error *err = drgn_program_symbols_search(prog, NULL, address, + flags, &builder); + if (err) + drgn_symbol_result_builder_abort(&builder); + else + *ret = drgn_symbol_result_builder_single(&builder); + return err; } LIBDRGN_PUBLIC struct drgn_error * diff --git a/libdrgn/program.h b/libdrgn/program.h index 7e997f167..636ee2445 100644 --- a/libdrgn/program.h +++ b/libdrgn/program.h @@ -353,18 +353,17 @@ struct drgn_error *drgn_program_cache_prstatus_entry(struct drgn_program *prog, uint32_t *ret); /* - * Like @ref drgn_program_find_symbol_by_address(), but @p ret is already - * allocated, we may already know the module, and doesn't return a @ref - * drgn_error. + * Like @ref drgn_program_find_symbol_by_address(), but returns @c NULL rather + * than a lookup error if the symbol was not found. * - * @param[in] module Module containing the address. 
May be @c NULL, in which - * case this will look it up. - * @return Whether the symbol was found. + * @param[in] address Address to search for. + * @param [out] ret The symbol found by the lookup (if found) + * @return @c NULL unless an error (unrelated to a lookup error) was encountered */ -bool drgn_program_find_symbol_by_address_internal(struct drgn_program *prog, - uint64_t address, - Dwfl_Module *module, - struct drgn_symbol *ret); +struct drgn_error * +drgn_program_find_symbol_by_address_internal(struct drgn_program *prog, + uint64_t address, + struct drgn_symbol **ret); /* * Implementation of the Symbol finder API, based on ELF symbols diff --git a/libdrgn/stack_trace.c b/libdrgn/stack_trace.c index 75f85036f..7cb2032f6 100644 --- a/libdrgn/stack_trace.c +++ b/libdrgn/stack_trace.c @@ -109,6 +109,7 @@ drgn_stack_trace_num_frames(struct drgn_stack_trace *trace) LIBDRGN_PUBLIC struct drgn_error * drgn_format_stack_trace(struct drgn_stack_trace *trace, char **ret) { + struct drgn_error *err; STRING_BUILDER(str); for (size_t frame = 0; frame < trace->num_frames; frame++) { if (!string_builder_appendf(&str, "#%-2zu ", frame)) @@ -121,19 +122,19 @@ drgn_format_stack_trace(struct drgn_stack_trace *trace, char **ret) if (!string_builder_append(&str, name)) return &drgn_enomem; } else if ((pc = drgn_register_state_get_pc(regs)).has_value) { - Dwfl_Module *dwfl_module = - regs->module ? 
regs->module->dwfl_module : NULL; - struct drgn_symbol sym; - if (dwfl_module && - drgn_program_find_symbol_by_address_internal(trace->prog, - pc.value - !regs->interrupted, - dwfl_module, - &sym)) { + _cleanup_symbol_ struct drgn_symbol *sym = NULL; + err = drgn_program_find_symbol_by_address_internal(trace->prog, + pc.value - !regs->interrupted, + &sym); + if (err) + return err; + + if (sym) { if (!string_builder_appendf(&str, "%s+0x%" PRIx64 "/0x%" PRIx64, - sym.name, - pc.value - sym.address, - sym.size)) + sym->name, + pc.value - sym->address, + sym->size)) return &drgn_enomem; } else { if (!string_builder_appendf(&str, "0x%" PRIx64, @@ -173,6 +174,7 @@ drgn_format_stack_frame(struct drgn_stack_trace *trace, size_t frame, char **ret { STRING_BUILDER(str); struct drgn_register_state *regs = trace->frames[frame].regs; + struct drgn_error *err; if (!string_builder_appendf(&str, "#%zu at ", frame)) return &drgn_enomem; @@ -181,17 +183,15 @@ drgn_format_stack_frame(struct drgn_stack_trace *trace, size_t frame, char **ret if (!string_builder_appendf(&str, "%#" PRIx64, pc.value)) return &drgn_enomem; - Dwfl_Module *dwfl_module = - regs->module ? 
regs->module->dwfl_module : NULL; - struct drgn_symbol sym; - if (dwfl_module && - drgn_program_find_symbol_by_address_internal(trace->prog, - pc.value - !regs->interrupted, - dwfl_module, - &sym) && - !string_builder_appendf(&str, " (%s+0x%" PRIx64 "/0x%" PRIx64 ")", - sym.name, pc.value - sym.address, - sym.size)) + _cleanup_symbol_ struct drgn_symbol *sym = NULL; /* NULL so the cleanup handler is safe if the lookup fails */ + err = drgn_program_find_symbol_by_address_internal(trace->prog, + pc.value - !regs->interrupted, + &sym); + if (err) + return err; + if (sym && !string_builder_appendf(&str, " (%s+0x%" PRIx64 "/0x%" PRIx64 ")", + sym->name, pc.value - sym->address, + sym->size)) return &drgn_enomem; } else { if (!string_builder_append(&str, "???")) @@ -368,17 +368,15 @@ drgn_stack_frame_symbol(struct drgn_stack_trace *trace, size_t frame, "program counter is not known at stack frame"); } pc.value -= !regs->interrupted; - Dwfl_Module *dwfl_module = - regs->module ? regs->module->dwfl_module : NULL; - if (!dwfl_module) - return drgn_error_symbol_not_found(pc.value); - _cleanup_free_ struct drgn_symbol *sym = malloc(sizeof(*sym)); + struct drgn_symbol *sym = NULL; + struct drgn_error *err; + err = drgn_program_find_symbol_by_address_internal(trace->prog, pc.value, + &sym); + if (err) + return err; if (!sym) - return &drgn_enomem; - if (!drgn_program_find_symbol_by_address_internal(trace->prog, pc.value, - dwfl_module, sym)) return drgn_error_symbol_not_found(pc.value); - *ret = no_cleanup_ptr(sym); + *ret = sym; return NULL; } diff --git a/libdrgn/symbol.h b/libdrgn/symbol.h index 4241a3ef3..495566307 100644 --- a/libdrgn/symbol.h +++ b/libdrgn/symbol.h @@ -6,6 +6,7 @@ #include +#include "cleanup.h" #include "drgn.h" #include "vector.h" @@ -34,6 +35,12 @@ struct drgn_symbol_result_builder { }; }; +#define _cleanup_symbol_ _cleanup_(drgn_symbol_cleanup) +static inline void drgn_symbol_cleanup(struct drgn_symbol **p) +{ + drgn_symbol_destroy(*p); +} + /** Initialize a @ref drgn_symbol from an ELF symbol.
*/ void drgn_symbol_from_elf(const char *name, uint64_t address, const GElf_Sym *elf_sym, struct drgn_symbol *ret); From f824149155e7a358c3e63057064a130430fc167d Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 05/10] libdrgn: move elf_symbols_search to debug_info.c Now that the symbol finder API is created, we can move the ELF symbol implementation into the debug_info.c file, where it more logically belongs. The only change to these functions in the move is to declare elf_symbols_search as static. Signed-off-by: Stephen Brennan --- libdrgn/debug_info.c | 142 +++++++++++++++++++++++++++++++++++++++++++ libdrgn/program.c | 142 ------------------------------------------- libdrgn/program.h | 7 --- 3 files changed, 142 insertions(+), 149 deletions(-) diff --git a/libdrgn/debug_info.c b/libdrgn/debug_info.c index 0eb75104e..44f1cd343 100644 --- a/libdrgn/debug_info.c +++ b/libdrgn/debug_info.c @@ -2024,6 +2024,148 @@ struct drgn_error *drgn_debug_info_load(struct drgn_debug_info *dbinfo, goto out; } +struct elf_symbols_search_arg { + const char *name; + uint64_t address; + enum drgn_find_symbol_flags flags; + struct drgn_error *err; + struct drgn_symbol_result_builder *builder; +}; + +static bool elf_symbol_match(struct elf_symbols_search_arg *arg, GElf_Addr addr, + const GElf_Sym *sym, const char *name) +{ + if ((arg->flags & DRGN_FIND_SYMBOL_NAME) && strcmp(name, arg->name) != 0) + return false; + if ((arg->flags & DRGN_FIND_SYMBOL_ADDR) && + (arg->address < addr || arg->address >= addr + sym->st_size)) + return false; + return true; +} + +static bool elf_symbol_store_match(struct elf_symbols_search_arg *arg, + GElf_Sym *elf_sym, GElf_Addr addr, + const char *name) +{ + struct drgn_symbol *sym; + if (arg->flags == (DRGN_FIND_SYMBOL_ONE | DRGN_FIND_SYMBOL_NAME)) { + int binding = GELF_ST_BIND(elf_sym->st_info); + /* + * The order of precedence is + * GLOBAL = UNIQUE > WEAK > LOCAL = everything else + * + * If we found 
a global or unique symbol, return it + * immediately. If we found a weak symbol, then save it, + * which may overwrite a previously found weak or local + * symbol. Otherwise, save the symbol only if we haven't + * found another symbol. + */ + if (binding != STB_GLOBAL + && binding != STB_GNU_UNIQUE + && binding != STB_WEAK + && drgn_symbol_result_builder_count(arg->builder) > 0) + return false; + sym = malloc(sizeof(*sym)); + if (!sym) { + arg->err = &drgn_enomem; + return true; + } + drgn_symbol_from_elf(name, addr, elf_sym, sym); + if (!drgn_symbol_result_builder_add(arg->builder, sym)) { + arg->err = &drgn_enomem; + drgn_symbol_destroy(sym); + } + + /* Abort on error, or short-circuit if we found a global or + * unique symbol */ + return (arg->err || sym->binding == DRGN_SYMBOL_BINDING_GLOBAL + || sym->binding == DRGN_SYMBOL_BINDING_UNIQUE); + } else { + sym = malloc(sizeof(*sym)); + if (!sym) { + arg->err = &drgn_enomem; + return true; + } + drgn_symbol_from_elf(name, addr, elf_sym, sym); + if (!drgn_symbol_result_builder_add(arg->builder, sym)) { + arg->err = &drgn_enomem; + drgn_symbol_destroy(sym); + } + /* Abort on error, or short-circuit for single lookup */ + return (arg->err || (arg->flags & DRGN_FIND_SYMBOL_ONE)); + } +} + +static int elf_symbols_search_cb(Dwfl_Module *dwfl_module, void **userdatap, + const char *module_name, Dwarf_Addr base, + void *cb_arg) +{ + struct elf_symbols_search_arg *arg = cb_arg; + + int symtab_len = dwfl_module_getsymtab(dwfl_module); + if (symtab_len == -1) + return DWARF_CB_OK; + + /* Ignore the zeroth null symbol */ + for (int i = 1; i < symtab_len; i++) { + GElf_Sym elf_sym; + GElf_Addr elf_addr; + const char *name = dwfl_module_getsym_info(dwfl_module, i, + &elf_sym, &elf_addr, + NULL, NULL, NULL); + if (!name || !elf_symbol_match(arg, elf_addr, &elf_sym, name)) + continue; + if (elf_symbol_store_match(arg, &elf_sym, elf_addr, name)) + return DWARF_CB_ABORT; + } + return DWARF_CB_OK; +} + +static struct drgn_error * 
+elf_symbols_search(const char *name, uint64_t addr, enum drgn_find_symbol_flags flags, + void *data, struct drgn_symbol_result_builder *builder) +{ + Dwfl_Module *dwfl_module = NULL; + struct drgn_program *prog = data; + struct elf_symbols_search_arg arg = { + .name = name, + .address = addr, + .flags = flags, + .err = NULL, + .builder = builder, + }; + + if (arg.flags & DRGN_FIND_SYMBOL_ADDR) { + dwfl_module = dwfl_addrmodule(prog->dbinfo.dwfl, arg.address); + if (!dwfl_module) + return NULL; + } + + if ((arg.flags & (DRGN_FIND_SYMBOL_ADDR | DRGN_FIND_SYMBOL_ONE)) + == (DRGN_FIND_SYMBOL_ADDR | DRGN_FIND_SYMBOL_ONE)) { + GElf_Off offset; + GElf_Sym elf_sym; + const char *name = dwfl_module_addrinfo( + dwfl_module, addr, &offset, + &elf_sym, NULL, NULL, NULL); + if (!name) + return NULL; + struct drgn_symbol *sym = malloc(sizeof(*sym)); + if (!sym) + return &drgn_enomem; + drgn_symbol_from_elf(name, addr - offset, &elf_sym, sym); + if (!drgn_symbol_result_builder_add(builder, sym)) { + arg.err = &drgn_enomem; + drgn_symbol_destroy(sym); + } + } else if (dwfl_module) { + elf_symbols_search_cb(dwfl_module, NULL, NULL, 0, &arg); + } else { + dwfl_getmodules(prog->dbinfo.dwfl, elf_symbols_search_cb, &arg, 0); + } + return arg.err; +} + bool drgn_debug_info_is_indexed(struct drgn_debug_info *dbinfo, const char *name) { diff --git a/libdrgn/program.c b/libdrgn/program.c index 0a09c13d7..1f1d8a4f2 100644 --- a/libdrgn/program.c +++ b/libdrgn/program.c @@ -1789,148 +1789,6 @@ struct drgn_error *drgn_error_symbol_not_found(uint64_t address) address); } -struct elf_symbols_search_arg { - const char *name; - uint64_t address; - enum drgn_find_symbol_flags flags; - struct drgn_error *err; - struct drgn_symbol_result_builder *builder; -}; - -static bool elf_symbol_match(struct elf_symbols_search_arg *arg, GElf_Addr addr, - const GElf_Sym *sym, const char *name) -{ - if ((arg->flags & DRGN_FIND_SYMBOL_NAME) && strcmp(name, arg->name) != 0) - return false; - if ((arg->flags & 
DRGN_FIND_SYMBOL_ADDR) && - (arg->address < addr || arg->address >= addr + sym->st_size)) - return false; - return true; -} - -static bool elf_symbol_store_match(struct elf_symbols_search_arg *arg, - GElf_Sym *elf_sym, GElf_Addr addr, - const char *name) -{ - struct drgn_symbol *sym; - if (arg->flags == (DRGN_FIND_SYMBOL_ONE | DRGN_FIND_SYMBOL_NAME)) { - int binding = GELF_ST_BIND(elf_sym->st_info); - /* - * The order of precedence is - * GLOBAL = UNIQUE > WEAK > LOCAL = everything else - * - * If we found a global or unique symbol, return it - * immediately. If we found a weak symbol, then save it, - * which may overwrite a previously found weak or local - * symbol. Otherwise, save the symbol only if we haven't - * found another symbol. - */ - if (binding != STB_GLOBAL - && binding != STB_GNU_UNIQUE - && binding != STB_WEAK - && drgn_symbol_result_builder_count(arg->builder) > 0) - return false; - sym = malloc(sizeof(*sym)); - if (!sym) { - arg->err = &drgn_enomem; - return true; - } - drgn_symbol_from_elf(name, addr, elf_sym, sym); - if (!drgn_symbol_result_builder_add(arg->builder, sym)) { - arg->err = &drgn_enomem; - drgn_symbol_destroy(sym); - } - - /* Abort on error, or short-circuit if we found a global or - * unique symbol */ - return (arg->err || sym->binding == DRGN_SYMBOL_BINDING_GLOBAL - || sym->binding == DRGN_SYMBOL_BINDING_UNIQUE); - } else { - sym = malloc(sizeof(*sym)); - if (!sym) { - arg->err = &drgn_enomem; - return true; - } - drgn_symbol_from_elf(name, addr, elf_sym, sym); - if (!drgn_symbol_result_builder_add(arg->builder, sym)) { - arg->err = &drgn_enomem; - drgn_symbol_destroy(sym); - } - /* Abort on error, or short-circuit for single lookup */ - return (arg->err || (arg->flags & DRGN_FIND_SYMBOL_ONE)); - } -} - -static int elf_symbols_search_cb(Dwfl_Module *dwfl_module, void **userdatap, - const char *module_name, Dwarf_Addr base, - void *cb_arg) -{ - struct elf_symbols_search_arg *arg = cb_arg; - - int symtab_len = 
dwfl_module_getsymtab(dwfl_module); - if (symtab_len == -1) - return DWARF_CB_OK; - - /* Ignore the zeroth null symbol */ - for (int i = 1; i < symtab_len; i++) { - GElf_Sym elf_sym; - GElf_Addr elf_addr; - const char *name = dwfl_module_getsym_info(dwfl_module, i, - &elf_sym, &elf_addr, - NULL, NULL, NULL); - if (!name || !elf_symbol_match(arg, elf_addr, &elf_sym, name)) - continue; - if (elf_symbol_store_match(arg, &elf_sym, elf_addr, name)) - return DWARF_CB_ABORT; - } - return DWARF_CB_OK; -} - -struct drgn_error * -elf_symbols_search(const char *name, uint64_t addr, enum drgn_find_symbol_flags flags, - void *data, struct drgn_symbol_result_builder *builder) -{ - Dwfl_Module *dwfl_module = NULL; - struct drgn_program *prog = data; - struct elf_symbols_search_arg arg = { - .name = name, - .address = addr, - .flags = flags, - .err = NULL, - .builder = builder, - }; - - if (arg.flags & DRGN_FIND_SYMBOL_ADDR) { - dwfl_module = dwfl_addrmodule(prog->dbinfo.dwfl, arg.address); - if (!dwfl_module) - return NULL; - } - - if ((arg.flags & (DRGN_FIND_SYMBOL_ADDR | DRGN_FIND_SYMBOL_ONE)) - == (DRGN_FIND_SYMBOL_ADDR | DRGN_FIND_SYMBOL_ONE)) { - GElf_Off offset; - GElf_Sym elf_sym; - const char *name = dwfl_module_addrinfo( - dwfl_module, addr, &offset, - &elf_sym, NULL, NULL, NULL); - if (!name) - return NULL; - struct drgn_symbol *sym = malloc(sizeof(*sym)); - if (!sym) - return &drgn_enomem; - drgn_symbol_from_elf(name, addr - offset, &elf_sym, sym); - if (!drgn_symbol_result_builder_add(builder, sym)) { - arg.err = &drgn_enomem; - drgn_symbol_destroy(sym); - } - } else if (dwfl_module) { - elf_symbols_search_cb(dwfl_module, NULL, NULL, 0, &arg); - } else { - dwfl_getmodules(prog->dbinfo.dwfl, elf_symbols_search_cb, &arg, 0); - } - return arg.err; -} - static struct drgn_error * drgn_program_symbols_search(struct drgn_program *prog, const char *name, uint64_t addr, enum drgn_find_symbol_flags flags, diff --git a/libdrgn/program.h b/libdrgn/program.h index 
636ee2445..2f6c1f5de 100644 --- a/libdrgn/program.h +++ b/libdrgn/program.h @@ -365,13 +365,6 @@ drgn_program_find_symbol_by_address_internal(struct drgn_program *prog, uint64_t address, struct drgn_symbol **ret); -/* - * Implementation of the Symbol finder API, based on ELF symbols - */ -struct drgn_error * -elf_symbols_search(const char *name, uint64_t addr, enum drgn_find_symbol_flags flags, - void *data, struct drgn_symbol_result_builder *builder); - struct drgn_error * drgn_program_add_symbol_finder_impl(struct drgn_program *prog, struct drgn_symbol_finder *finder, From ca6684ceca1c6dcd44c9dc4780c6b939257bf885 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 06/10] python: Allow construction of Symbol objects Previously, Symbol objects could not be constructed in Python. However, in order to allow Python Symbol finders, this needs to be changed. Unfortunately, Symbol name lifetimes are tricky to manage. We introduce a lifetime enumeration to handle this. The lifetime may be "static", i.e. longer than the life of the program; "external", i.e. longer than the life of the symbol, but no guarantees beyond that; or "owned", i.e. owned by the Symbol itself. Symbol objects constructed in Python are "external". The Symbol struct owns the pointer to the drgn_symbol, and it holds a reference to the Python object keeping the name valid (either the program, or a PyUnicode object). The added complexity is justified by the fact that most symbols are from the ELF file, and thus share a lifetime with the Program. It would be a waste to constantly strdup() these strings, just to support a small number of Symbols created by Python code. 
Signed-off-by: Stephen Brennan --- libdrgn/drgn.h | 38 ++++++++++++++++++++++++++++ libdrgn/python/drgnpy.h | 4 +-- libdrgn/python/program.c | 4 +-- libdrgn/python/stack_trace.c | 2 +- libdrgn/python/symbol.c | 48 +++++++++++++++++++++++++++++++++--- libdrgn/symbol.c | 24 ++++++++++++++++++ libdrgn/symbol.h | 1 + tests/test_symbol.py | 41 +++++++----------------------- 8 files changed, 121 insertions(+), 41 deletions(-) diff --git a/libdrgn/drgn.h b/libdrgn/drgn.h index b042b5659..705592f2b 100644 --- a/libdrgn/drgn.h +++ b/libdrgn/drgn.h @@ -2967,6 +2967,44 @@ enum drgn_symbol_kind { DRGN_SYMBOL_KIND_IFUNC = 10, /* STT_GNU_IFUNC */ } __attribute__((__packed__)); +/** Describes the lifetime of an object provided to drgn */ +enum drgn_lifetime { + /** + * DRGN_LIFETIME_STATIC: the object is guaranteed to outlive the + * drgn_program itself. drgn will not free or copy the object. + */ + DRGN_LIFETIME_STATIC, + /** + * DRGN_LIFETIME_EXTERNAL: the object is externally managed. It will + * live as long as the object it is associated with, but may be freed + * after. drgn will never free the object. If drgn must copy a data + * structure, the object will be duplicated, and drgn will own the new + * object. + */ + DRGN_LIFETIME_EXTERNAL, + /** + * DRGN_LIFETIME_OWNED: the object lifetime is managed by drgn. It + * should be freed when the containing object is freed. If the + * containing object is copied, it must also be copied. + */ + DRGN_LIFETIME_OWNED, +} __attribute__((__packed__)); + +/** + * Create a new @ref drgn_symbol with the given values + * + * All parameters should be self-explanatory, except for @a name_lifetime. + * Clients can use this to describe how drgn should treat the string @a name. + * Strings with lifetime @c STATIC will never be copied or freed. Strings with + * lifetime @c OWNED will always be copied and freed with the symbol. Strings + * with lifetime @c EXTERNAL will not be freed, but if the Symbol is copied, they + * will be copied.
+ */ +struct drgn_error * +drgn_symbol_create(const char *name, uint64_t address, uint64_t size, + enum drgn_symbol_binding binding, enum drgn_symbol_kind kind, + enum drgn_lifetime name_lifetime, struct drgn_symbol **ret); + /** Destroy a @ref drgn_symbol. */ void drgn_symbol_destroy(struct drgn_symbol *sym); diff --git a/libdrgn/python/drgnpy.h b/libdrgn/python/drgnpy.h index d3c3e8505..c745eab6f 100644 --- a/libdrgn/python/drgnpy.h +++ b/libdrgn/python/drgnpy.h @@ -173,7 +173,7 @@ typedef struct { typedef struct { PyObject_HEAD - Program *prog; + PyObject *name_obj; /* object owning the reference to the symbol name */ struct drgn_symbol *sym; } Symbol; @@ -288,7 +288,7 @@ Program *program_from_core_dump(PyObject *self, PyObject *args, PyObject *kwds); Program *program_from_kernel(PyObject *self); Program *program_from_pid(PyObject *self, PyObject *args, PyObject *kwds); -PyObject *Symbol_wrap(struct drgn_symbol *sym, Program *prog); +PyObject *Symbol_wrap(struct drgn_symbol *sym, PyObject *name_obj); PyObject *Thread_wrap(struct drgn_thread *drgn_thread); diff --git a/libdrgn/python/program.c b/libdrgn/python/program.c index af62218c4..3e2e1d4f3 100644 --- a/libdrgn/python/program.c +++ b/libdrgn/python/program.c @@ -900,7 +900,7 @@ static PyObject *Program_symbols(Program *self, PyObject *args) return NULL; } for (size_t i = 0; i < count; i++) { - PyObject *pysym = Symbol_wrap(symbols[i], self); + PyObject *pysym = Symbol_wrap(symbols[i], (PyObject *)self); if (!pysym) { /* Free symbols which aren't yet added to list. 
*/ drgn_symbols_destroy(symbols, count); @@ -936,7 +936,7 @@ static PyObject *Program_symbol(Program *self, PyObject *arg) } if (err) return set_drgn_error(err); - ret = Symbol_wrap(sym, self); + ret = Symbol_wrap(sym, (PyObject *)self); if (!ret) { drgn_symbol_destroy(sym); return NULL; diff --git a/libdrgn/python/stack_trace.c b/libdrgn/python/stack_trace.c index 3e43b182a..1112a8092 100644 --- a/libdrgn/python/stack_trace.c +++ b/libdrgn/python/stack_trace.c @@ -209,7 +209,7 @@ static PyObject *StackFrame_symbol(StackFrame *self) err = drgn_stack_frame_symbol(self->trace->trace, self->i, &sym); if (err) return set_drgn_error(err); - PyObject *ret = Symbol_wrap(sym, prog); + PyObject *ret = Symbol_wrap(sym, (PyObject *)prog); if (!ret) { drgn_symbol_destroy(sym); return NULL; diff --git a/libdrgn/python/symbol.c b/libdrgn/python/symbol.c index ea0d98cb3..3e0b0f26f 100644 --- a/libdrgn/python/symbol.c +++ b/libdrgn/python/symbol.c @@ -3,23 +3,62 @@ #include +#include "drgn.h" #include "drgnpy.h" +#include "symbol.h" -PyObject *Symbol_wrap(struct drgn_symbol *sym, Program *prog) +PyObject *Symbol_wrap(struct drgn_symbol *sym, PyObject *name_obj) { Symbol *ret = call_tp_alloc(Symbol); if (ret) { ret->sym = sym; - ret->prog = prog; - Py_INCREF(prog); + ret->name_obj = name_obj; + Py_XINCREF(name_obj); } return (PyObject *)ret; } +static PyObject *Symbol_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) +{ + struct drgn_symbol *sym; + static char *keywords[] = {"name", "address", "size", "binding", "kind", NULL}; + PyObject *name_obj; + struct index_arg address = {}, size = {}; + struct enum_arg binding = { + .type = SymbolBinding_class, + }; + struct enum_arg kind = { + .type = SymbolKind_class, + }; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!O&O&O&O&:Symbol", keywords, + &PyUnicode_Type, &name_obj, + index_converter, &address, + index_converter, &size, + enum_converter, &binding, + enum_converter, &kind)) + return NULL; + + const char *name = 
PyUnicode_AsUTF8(name_obj); + if (!name) + return NULL; + + struct drgn_error *err = drgn_symbol_create( + name, address.uvalue,size.uvalue, binding.value, kind.value, + DRGN_LIFETIME_EXTERNAL, &sym); + if (err) + return set_drgn_error(err); + + PyObject *ret = Symbol_wrap(sym, name_obj); + if (!ret) + drgn_symbol_destroy(sym); + return ret; +} + static void Symbol_dealloc(Symbol *self) { drgn_symbol_destroy(self->sym); - Py_XDECREF(self->prog); + Py_XDECREF(self->name_obj); Py_TYPE(self)->tp_free((PyObject *)self); } @@ -100,4 +139,5 @@ PyTypeObject Symbol_type = { .tp_doc = drgn_Symbol_DOC, .tp_richcompare = (richcmpfunc)Symbol_richcompare, .tp_getset = Symbol_getset, + .tp_new = Symbol_new, }; diff --git a/libdrgn/symbol.c b/libdrgn/symbol.c index e6097b442..53ba15545 100644 --- a/libdrgn/symbol.c +++ b/libdrgn/symbol.c @@ -11,6 +11,11 @@ LIBDRGN_PUBLIC void drgn_symbol_destroy(struct drgn_symbol *sym) { + if (sym && sym->name_lifetime == DRGN_LIFETIME_OWNED) + /* Cast here is necessary - we want symbol users to + * never modify sym->name, but when we own the name, + * we must modify it by freeing it. 
*/ + free((char *)sym->name); free(sym); } @@ -26,6 +31,7 @@ void drgn_symbol_from_elf(const char *name, uint64_t address, const GElf_Sym *elf_sym, struct drgn_symbol *ret) { ret->name = name; + ret->name_lifetime = DRGN_LIFETIME_STATIC; ret->address = address; ret->size = elf_sym->st_size; int binding = GELF_ST_BIND(elf_sym->st_info); @@ -40,6 +46,24 @@ void drgn_symbol_from_elf(const char *name, uint64_t address, ret->kind = DRGN_SYMBOL_KIND_UNKNOWN; } +LIBDRGN_PUBLIC struct drgn_error * +drgn_symbol_create(const char *name, uint64_t address, uint64_t size, + enum drgn_symbol_binding binding, enum drgn_symbol_kind kind, + enum drgn_lifetime name_lifetime, struct drgn_symbol **ret) +{ + struct drgn_symbol *sym = malloc(sizeof(*sym)); + if (!sym) + return &drgn_enomem; + sym->name = name; + sym->address = address; + sym->size = size; + sym->binding = binding; + sym->kind = kind; + sym->name_lifetime = name_lifetime; + *ret = sym; + return NULL; +} + LIBDRGN_PUBLIC const char *drgn_symbol_name(struct drgn_symbol *sym) { return sym->name; diff --git a/libdrgn/symbol.h b/libdrgn/symbol.h index 495566307..1c75e64af 100644 --- a/libdrgn/symbol.h +++ b/libdrgn/symbol.h @@ -16,6 +16,7 @@ struct drgn_symbol { uint64_t size; enum drgn_symbol_binding binding; enum drgn_symbol_kind kind; + enum drgn_lifetime name_lifetime; }; struct drgn_symbol_finder { diff --git a/tests/test_symbol.py b/tests/test_symbol.py index 3ff949325..a8a55d21a 100644 --- a/tests/test_symbol.py +++ b/tests/test_symbol.py @@ -1,9 +1,8 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. 
# SPDX-License-Identifier: LGPL-2.1-or-later import tempfile -from typing import NamedTuple -from drgn import Program, SymbolBinding, SymbolKind +from drgn import Program, Symbol, SymbolBinding, SymbolKind from tests import TestCase from tests.dwarfwriter import dwarf_sections from tests.elf import ET, PT, SHT, STB, STT @@ -45,35 +44,13 @@ def elf_symbol_program(*modules): return prog -# We don't want to support creating drgn.Symbol instances yet, so use this dumb -# class for testing. -class Symbol(NamedTuple): - name: str - address: int - size: int - binding: SymbolBinding - kind: SymbolKind - - class TestElfSymbol(TestCase): - def assert_symbol_equal(self, drgn_symbol, symbol): - self.assertEqual( - Symbol( - drgn_symbol.name, - drgn_symbol.address, - drgn_symbol.size, - drgn_symbol.binding, - drgn_symbol.kind, - ), - symbol, - ) - def assert_symbols_equal_unordered(self, drgn_symbols, symbols): self.assertEqual(len(drgn_symbols), len(symbols)) drgn_symbols = sorted(drgn_symbols, key=lambda x: (x.address, x.name)) symbols = sorted(symbols, key=lambda x: (x.address, x.name)) for drgn_symbol, symbol in zip(drgn_symbols, symbols): - self.assert_symbol_equal(drgn_symbol, symbol) + self.assertEqual(drgn_symbol, symbol) def test_by_address(self): elf_first = ElfSymbol("first", 0xFFFF0000, 0x8, STT.OBJECT, STB.LOCAL) @@ -91,13 +68,13 @@ def test_by_address(self): prog = elf_symbol_program(*modules) self.assertRaises(LookupError, prog.symbol, 0xFFFEFFFF) self.assertEqual(prog.symbols(0xFFFEFFFF), []) - self.assert_symbol_equal(prog.symbol(0xFFFF0000), first) + self.assertEqual(prog.symbol(0xFFFF0000), first) self.assert_symbols_equal_unordered(prog.symbols(0xFFFF0000), [first]) - self.assert_symbol_equal(prog.symbol(0xFFFF0004), first) + self.assertEqual(prog.symbol(0xFFFF0004), first) self.assert_symbols_equal_unordered(prog.symbols(0xFFFF0004), [first]) - self.assert_symbol_equal(prog.symbol(0xFFFF0008), second) + self.assertEqual(prog.symbol(0xFFFF0008), second) 
self.assert_symbols_equal_unordered(prog.symbols(0xFFFF0008), [second]) - self.assert_symbol_equal(prog.symbol(0xFFFF000C), second) + self.assertEqual(prog.symbol(0xFFFF000C), second) self.assert_symbols_equal_unordered(prog.symbols(0xFFFF000C), [second]) self.assertRaises(LookupError, prog.symbol, 0xFFFF0010) @@ -171,8 +148,8 @@ def test_by_name(self): for modules in same_module, different_modules: with self.subTest(modules=len(modules)): prog = elf_symbol_program(*modules) - self.assert_symbol_equal(prog.symbol("first"), first) - self.assert_symbol_equal(prog.symbol("second"), second) + self.assertEqual(prog.symbol("first"), first) + self.assertEqual(prog.symbol("second"), second) self.assertRaises(LookupError, prog.symbol, "third") self.assert_symbols_equal_unordered(prog.symbols("first"), [first]) @@ -258,7 +235,7 @@ def test_kind(self): (ElfSymbol("foo", 0xFFFF0000, 1, elf_type, STB.GLOBAL),) ) symbol = Symbol("foo", 0xFFFF0000, 1, SymbolBinding.GLOBAL, drgn_kind) - self.assert_symbol_equal(prog.symbol("foo"), symbol) + self.assertEqual(prog.symbol("foo"), symbol) symbols = prog.symbols("foo") self.assert_symbols_equal_unordered(symbols, [symbol]) From 50a2377891e8ca788545e2d9c373a77628225f42 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 07/10] python: Add Program.add_symbol_finder() Expose the Symbol finder API so that Python code can be used to lookup additional symbols by name or address. Signed-off-by: Stephen Brennan --- _drgn.pyi | 27 ++++++++++++ libdrgn/python/program.c | 94 ++++++++++++++++++++++++++++++++++++++++ libdrgn/symbol.c | 19 ++++++++ libdrgn/symbol.h | 3 ++ 4 files changed, 143 insertions(+) diff --git a/_drgn.pyi b/_drgn.pyi index 9863725d3..47dc51411 100644 --- a/_drgn.pyi +++ b/_drgn.pyi @@ -453,6 +453,33 @@ class Program: return an :class:`Object` or ``None`` if not found. """ ... 
+ def add_symbol_finder( + self, fn: Callable[[Optional[str], Optional[int], bool], Sequence[Symbol]] + ) -> None: + """ + Register a callback for finding symbols in the program. + + The callback should take three arguments: a search name, a search + address, and a boolean flag 'one' indicating whether to return only + the single best match. When the 'one' flag is True, the callback should + return a list containing at most one :class:`Symbol`. When the flag is + False, the callback should return a list of all matching + :class:`Symbol`\\ s. Both the name and address arguments are optional. + If both are provided, then the result(s) should match both. If neither + is provided, the finder should return all available symbols. If no + result is found, the return should be an empty list. + + Callbacks are called in reverse order of the order they were added + (i.e., the most recently added callback is called first). When the + 'one' flag is set, the search will short-circuit after the first + finder which returns a result, and subsequent finders will not be + called. Otherwise, all callbacks will be called, and all results will be + returned. + + :param fn: Callable taking name, address, and 'one' flag, and + returning a sequence of :class:`Symbol`\\ s. + """ + ... def set_core_dump(self, path: Union[Path, int]) -> None: """ Set the program to a core dump.
diff --git a/libdrgn/python/program.c b/libdrgn/python/program.c index 3e2e1d4f3..eb024774a 100644 --- a/libdrgn/python/program.c +++ b/libdrgn/python/program.c @@ -477,6 +477,70 @@ static struct drgn_error *py_object_find_fn(const char *name, size_t name_len, return drgn_object_copy(ret, &((DrgnObject *)obj)->obj); } +static struct drgn_error *py_symbol_find_fn(const char *name, uint64_t addr, + enum drgn_find_symbol_flags flags, + void *data, struct drgn_symbol_result_builder *builder) +{ + PyGILState_guard(); + + _cleanup_pydecref_ PyObject *name_obj = NULL; + if (flags & DRGN_FIND_SYMBOL_NAME) { + name_obj = PyUnicode_FromString(name); + if (!name_obj) + return drgn_error_from_python(); + } else { + name_obj = Py_None; + Py_INCREF(name_obj); + } + + _cleanup_pydecref_ PyObject *address_obj = NULL; + if (flags & DRGN_FIND_SYMBOL_ADDR) { + address_obj = PyLong_FromUnsignedLong(addr); + if (!address_obj) + return drgn_error_from_python(); + } else { + address_obj = Py_None; + Py_INCREF(address_obj); + } + + _cleanup_pydecref_ PyObject *one_obj = PyBool_FromLong(flags & DRGN_FIND_SYMBOL_ONE); + + _cleanup_pydecref_ PyObject *tmp = PyObject_CallFunction(data, "OOO", name_obj, + address_obj, one_obj); + if (!tmp) + return drgn_error_from_python(); + + _cleanup_pydecref_ PyObject *obj = + PySequence_Fast(tmp, "symbol finder must return a sequence"); + if (!obj) + return drgn_error_from_python(); + + size_t len = PySequence_Fast_GET_SIZE(obj); + if (len > 1 && (flags & DRGN_FIND_SYMBOL_ONE)) { + return drgn_error_create(DRGN_ERROR_INVALID_ARGUMENT, + "symbol finder returned multiple elements, but one was requested"); + } + + for (size_t i = 0; i < len; i++) { + PyObject *item = PySequence_Fast_GET_ITEM(obj, i); + if (!PyObject_TypeCheck(item, &Symbol_type)) + return drgn_error_create(DRGN_ERROR_TYPE, + "symbol finder results must be of type Symbol"); + _cleanup_free_ struct drgn_symbol *sym = malloc(sizeof(*sym)); + if (!sym) + return &drgn_enomem; + struct drgn_error 
*err = drgn_symbol_copy(sym, ((Symbol *)item)->sym); + if (err) + return err; + + if (!drgn_symbol_result_builder_add(builder, sym)) + return &drgn_enomem; + sym = NULL; // owned by the builder now + } + + return NULL; +} + static PyObject *Program_add_object_finder(Program *self, PyObject *args, PyObject *kwds) { @@ -506,6 +570,34 @@ static PyObject *Program_add_object_finder(Program *self, PyObject *args, Py_RETURN_NONE; } +static PyObject *Program_add_symbol_finder(Program *self, PyObject *args, + PyObject *kwds) +{ + static char *keywords[] = {"fn", NULL}; + struct drgn_error *err; + PyObject *fn; + int ret; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:add_symbol_finder", + keywords, &fn)) + return NULL; + + if (!PyCallable_Check(fn)) { + PyErr_SetString(PyExc_TypeError, "fn must be callable"); + return NULL; + } + + ret = Program_hold_object(self, fn); + if (ret == -1) + return NULL; + + err = drgn_program_add_symbol_finder(&self->prog, py_symbol_find_fn, + fn); + if (err) + return set_drgn_error(err); + Py_RETURN_NONE; +} + static PyObject *Program_set_core_dump(Program *self, PyObject *args, PyObject *kwds) { @@ -1120,6 +1212,8 @@ static PyMethodDef Program_methods[] = { METH_VARARGS | METH_KEYWORDS, drgn_Program_add_type_finder_DOC}, {"add_object_finder", (PyCFunction)Program_add_object_finder, METH_VARARGS | METH_KEYWORDS, drgn_Program_add_object_finder_DOC}, + {"add_symbol_finder", (PyCFunction)Program_add_symbol_finder, + METH_VARARGS | METH_KEYWORDS, drgn_Program_add_symbol_finder_DOC}, {"set_core_dump", (PyCFunction)Program_set_core_dump, METH_VARARGS | METH_KEYWORDS, drgn_Program_set_core_dump_DOC}, {"set_kernel", (PyCFunction)Program_set_kernel, METH_NOARGS, diff --git a/libdrgn/symbol.c b/libdrgn/symbol.c index 53ba15545..581fdc265 100644 --- a/libdrgn/symbol.c +++ b/libdrgn/symbol.c @@ -46,6 +46,25 @@ void drgn_symbol_from_elf(const char *name, uint64_t address, ret->kind = DRGN_SYMBOL_KIND_UNKNOWN; } +struct drgn_error * 
+drgn_symbol_copy(struct drgn_symbol *dst, struct drgn_symbol *src) +{ + if (src->name_lifetime == DRGN_LIFETIME_STATIC) { + dst->name = src->name; + dst->name_lifetime = DRGN_LIFETIME_STATIC; + } else { + dst->name = strdup(src->name); + if (!dst->name) + return &drgn_enomem; + dst->name_lifetime = DRGN_LIFETIME_OWNED; + } + dst->address = src->address; + dst->size = src->size; + dst->kind = src->kind; + dst->binding = src->binding; + return NULL; +} + LIBDRGN_PUBLIC struct drgn_error * drgn_symbol_create(const char *name, uint64_t address, uint64_t size, enum drgn_symbol_binding binding, enum drgn_symbol_kind kind, diff --git a/libdrgn/symbol.h b/libdrgn/symbol.h index 1c75e64af..b2e880af4 100644 --- a/libdrgn/symbol.h +++ b/libdrgn/symbol.h @@ -61,4 +61,7 @@ drgn_symbol_result_builder_single(struct drgn_symbol_result_builder *builder); void drgn_symbol_result_builder_array(struct drgn_symbol_result_builder *builder, struct drgn_symbol ***syms_ret, size_t *count_ret); +struct drgn_error * +drgn_symbol_copy(struct drgn_symbol *dst, struct drgn_symbol *src); + #endif /* DRGN_SYMBOL_H */ From 902e30f5a29b9577b7188964cf4380a0f6ac1001 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 08/10] Add test for Symbol Finder API Specify a "fake" symbol finder and then test that its results are plumbed through the API successfully. While this is a contrived test, it helps build confidence in the plumbing of the API. 
Signed-off-by: Stephen Brennan --- tests/test_symbol.py | 80 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/test_symbol.py b/tests/test_symbol.py index a8a55d21a..72c35af1d 100644 --- a/tests/test_symbol.py +++ b/tests/test_symbol.py @@ -263,3 +263,83 @@ def test_all_symbols(self): ] prog = elf_symbol_program(*elf_syms) self.assert_symbols_equal_unordered(prog.symbols(), syms) + + +class TestSymbolFinder(TestCase): + TEST_SYMS = [ + Symbol("one", 0xFFFF1000, 16, SymbolBinding.LOCAL, SymbolKind.FUNC), + Symbol("two", 0xFFFF2000, 16, SymbolBinding.GLOBAL, SymbolKind.FUNC), + Symbol("three", 0xFFFF2008, 8, SymbolBinding.GLOBAL, SymbolKind.FUNC), + ] + + def finder(self, arg_name, arg_address, arg_one): + self.called = True + res = [] + self.assertEqual(self.expected_name, arg_name) + self.assertEqual(self.expected_address, arg_address) + self.assertEqual(self.expected_one, arg_one) + for sym in self.TEST_SYMS: + if arg_name and sym.name == arg_name: + res.append(sym) + elif arg_address and sym.address <= arg_address < sym.address + sym.size: + res.append(sym) + elif not arg_name and not arg_address: + res.append(sym) + + # This symbol finder intentionally has a bug: it does not respect the + # "arg_one" flag: it may return multiple symbols even when "arg_one" is + # true. 
+ return res + + def setUp(self): + self.prog = Program() + self.prog.add_symbol_finder(self.finder) + self.called = False + + def expect_args(self, name, address, one): + self.expected_name = name + self.expected_address = address + self.expected_one = one + + def test_args_single_string(self): + self.expect_args("search_symbol", None, True) + with self.assertRaises(LookupError): + self.prog.symbol("search_symbol") + self.assertTrue(self.called) + + def test_args_single_int(self): + self.expect_args(None, 0xFF00, True) + with self.assertRaises(LookupError): + self.prog.symbol(0xFF00) + self.assertTrue(self.called) + + def test_args_single_with_many_results(self): + self.expect_args(None, 0xFFFF2008, True) + with self.assertRaises(ValueError): + self.prog.symbol(0xFFFF2008) + self.assertTrue(self.called) + + def test_single_with_result(self): + self.expect_args("one", None, True) + self.assertEqual(self.prog.symbol("one"), self.TEST_SYMS[0]) + self.assertTrue(self.called) + + def test_args_many_string(self): + self.expect_args("search_symbol", None, False) + self.assertEqual(self.prog.symbols("search_symbol"), []) + self.assertTrue(self.called) + + def test_args_many_int(self): + self.expect_args(None, 0xFF00, False) + self.assertEqual(self.prog.symbols(0xFF00), []) + self.assertTrue(self.called) + + def test_many_with_result(self): + self.expect_args(None, 0xFFFF2004, False) + self.assertEqual(self.prog.symbols(0xFFFF2004), [self.TEST_SYMS[1]]) + self.assertTrue(self.called) + + def test_many_without_filter(self): + self.expect_args(None, None, False) + self.assertEqual(self.prog.symbols(), self.TEST_SYMS) + self.assertTrue(self.called) From 85f31e64639745bb69d764f96a01abb1c1fd727b Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 09/10] libdrgn: Add helper for returning list of Symbols Signed-off-by: Stephen Brennan --- libdrgn/python/drgnpy.h | 2 ++ libdrgn/python/program.c | 18 +----------------- 
libdrgn/python/symbol.c | 23 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/libdrgn/python/drgnpy.h b/libdrgn/python/drgnpy.h index c745eab6f..56608e575 100644 --- a/libdrgn/python/drgnpy.h +++ b/libdrgn/python/drgnpy.h @@ -289,6 +289,8 @@ Program *program_from_kernel(PyObject *self); Program *program_from_pid(PyObject *self, PyObject *args, PyObject *kwds); PyObject *Symbol_wrap(struct drgn_symbol *sym, PyObject *name_obj); +PyObject *Symbol_list_wrap(struct drgn_symbol **symbols, size_t count, + Program *prog); PyObject *Thread_wrap(struct drgn_thread *drgn_thread); diff --git a/libdrgn/python/program.c b/libdrgn/python/program.c index eb024774a..c3b11ab2d 100644 --- a/libdrgn/python/program.c +++ b/libdrgn/python/program.c @@ -986,23 +986,7 @@ static PyObject *Program_symbols(Program *self, PyObject *args) if (err) return set_drgn_error(err); - _cleanup_pydecref_ PyObject *list = PyList_New(count); - if (!list) { - drgn_symbols_destroy(symbols, count); - return NULL; - } - for (size_t i = 0; i < count; i++) { - PyObject *pysym = Symbol_wrap(symbols[i], (PyObject *)self); - if (!pysym) { - /* Free symbols which aren't yet added to list. 
*/ - drgn_symbols_destroy(symbols, count); - return NULL; - } - symbols[i] = NULL; - PyList_SET_ITEM(list, i, pysym); - } - free(symbols); - return_ptr(list); + return Symbol_list_wrap(symbols, count, self); } static PyObject *Program_symbol(Program *self, PyObject *arg) diff --git a/libdrgn/python/symbol.c b/libdrgn/python/symbol.c index 3e0b0f26f..53946a5a1 100644 --- a/libdrgn/python/symbol.c +++ b/libdrgn/python/symbol.c @@ -18,6 +18,29 @@ PyObject *Symbol_wrap(struct drgn_symbol *sym, PyObject *name_obj) return (PyObject *)ret; } +PyObject *Symbol_list_wrap(struct drgn_symbol **symbols, size_t count, + Program *prog) +{ + _cleanup_pydecref_ PyObject *list = PyList_New(count); + if (!list) { + drgn_symbols_destroy(symbols, count); + return NULL; + } + for (size_t i = 0; i < count; i++) { + PyObject *pysym = Symbol_wrap(symbols[i], (PyObject *)prog); + if (!pysym) { + /* Free symbols which aren't yet added to list. */ + drgn_symbols_destroy(symbols, count); + /* Free list and all symbols already added. */ + return NULL; + } + symbols[i] = NULL; + PyList_SET_ITEM(list, i, pysym); + } + free(symbols); + return_ptr(list); +} + static PyObject *Symbol_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) { struct drgn_symbol *sym; From 06f5e86a710a84024851e8c823518fac595598ad Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 1 Mar 2024 16:46:53 -0800 Subject: [PATCH 10/10] kallsyms: add symbol finder for live & coredump The Linux kernel can be configured to include kallsyms, a built-in compressed symbol table which is also exposed at /proc/kallsyms. The symbol table contains most (but not all) of the ELF symbol table information. It can be used as a Symbol finder. The kallsyms information can be extracted in two ways: for live systems where we have root access, the simplest approach is to simply read /proc/kallsyms. For vmcores, or live systems where we are not root, we must parse the data from the vmcore, which is significantly more involved. 
To avoid tying the kallsyms system too deeply into the drgn internals, the finder is exposed as a Python class, which must be created using symbol information from the vmcoreinfo. Attaching the KallsymsFinder to the program will attach the underlying C function, so we can avoid some of the inefficiencies of the Python API. Signed-off-by: Stephen Brennan --- _drgn.pyi | 67 +++ docs/api_reference.rst | 1 + drgn/__init__.py | 2 + drgn/helpers/linux/kallsyms.py | 58 ++ libdrgn/Makefile.am | 3 + libdrgn/kallsyms.c | 891 +++++++++++++++++++++++++++++++ libdrgn/kallsyms.h | 134 +++++ libdrgn/python/drgnpy.h | 6 + libdrgn/python/kallsyms_finder.c | 147 +++++ libdrgn/python/main.c | 1 + libdrgn/python/program.c | 12 +- 11 files changed, 1320 insertions(+), 2 deletions(-) create mode 100644 drgn/helpers/linux/kallsyms.py create mode 100644 libdrgn/kallsyms.c create mode 100644 libdrgn/kallsyms.h create mode 100644 libdrgn/python/kallsyms_finder.c diff --git a/_drgn.pyi b/_drgn.pyi index 47dc51411..2e7217726 100644 --- a/_drgn.pyi +++ b/_drgn.pyi @@ -1612,6 +1612,73 @@ class Symbol: kind: Final[SymbolKind] """Kind of entity represented by this symbol.""" +class KallsymsFinder: + """ + A symbol finder which uses vmlinux kallsyms data + """ + + def __init__( + self, + prog: Program, + kallsyms_names: int, + kallsyms_token_table: int, + kallsyms_token_index: int, + kallsyms_num_syms: int, + kallsyms_offsets: int, + kallsyms_relative_base: int, + kallsyms_addresses: int, + _stext: int, + ) -> None: + """ + Manually construct a ``KallsymsFinder`` given all symbol addresses + + .. note:: + + This class should not normally be instantiated manually. See + :func:`drgn.helpers.linux.kallsyms.make_kallsyms_vmlinux_finder` + instead for a way of automatically creating the finder via + information found in the ``VMCOREINFO``. + + The finder is capable of searching the compressed table of symbol names + and addresses stored within kernel memory. 
It requires + ``CONFIG_KALLSYMS=y`` and ``CONFIG_KALLSYMS_ALL=y`` in your kernel + configuration -- this is common on desktop and server Linux + distributions. However, the quality of symbol information is not + excellent: the :attr:`Symbol.binding` and :attr:`Symbol.kind` values are + inferred from type code information provided by kallsyms which was + originally generated by ``nm(1)``. Further, the :attr:`Symbol.size` is + computed using the offset of the next symbol after it in memory. This + can create some unusual results. + + In order to create a ``KallsymsFinder``, drgn must know the location of + several symbols, which creates a bit of a chicken-and-egg problem. + Thankfully, starting with Linux 6.0, these symbol addresses are included + in the VMCOREINFO note. The required symbols are addresses of variables + in the vmcore: + + - ``kallsyms_names``: an array of compressed symbol name data. + - ``kallsyms_token_table``, ``kallsyms_token_index``: tables used in + decompressing symbol names. + - ``kallsyms_num_syms``: the number of kallsyms symbols + - ``_stext``: the start of the kernel text segment. This symbol address + is necessary for verifying decoded kallsyms data. + + Depending on the way that kallsyms is configured (see + ``CONFIG_KALLSYMS_ABSOLUTE_PERCPU`` and + ``CONFIG_KALLSYMS_BASE_RELATIVE``), the following symbols are needed. If + the symbol names are not present, they should be given as zero. + + - ``kallsyms_offsets`` + - ``kallsyms_relative_base`` + - ``kallsyms_addresses`` + + :param prog: Program to create a finder for + :returns: A callable object suitable to provide to + :meth:`Program.add_symbol_finder()`. 
+ """ + __call__: Callable[[Optional[str], Optional[int], bool], List[Symbol]] + """Lookup symbol by name, address, or both.""" + class SymbolBinding(enum.Enum): """ A ``SymbolBinding`` describes the linkage behavior and visibility of a diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 16c0e65a4..ffbfb379b 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -108,6 +108,7 @@ Symbols .. drgndoc:: Symbol .. drgndoc:: SymbolBinding .. drgndoc:: SymbolKind +.. drgndoc:: KallsymsFinder Stack Traces ------------ diff --git a/drgn/__init__.py b/drgn/__init__.py index 1df95b5fd..64b060be7 100644 --- a/drgn/__init__.py +++ b/drgn/__init__.py @@ -52,6 +52,7 @@ FaultError, FindObjectFlags, IntegerLike, + KallsymsFinder, Language, MissingDebugInfoError, NoDefaultProgramError, @@ -105,6 +106,7 @@ "FaultError", "FindObjectFlags", "IntegerLike", + "KallsymsFinder", "Language", "MissingDebugInfoError", "NULL", diff --git a/drgn/helpers/linux/kallsyms.py b/drgn/helpers/linux/kallsyms.py new file mode 100644 index 000000000..52d9a04b2 --- /dev/null +++ b/drgn/helpers/linux/kallsyms.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# Copyright (c) 2023 Oracle and/or its affiliates +# SPDX-License-Identifier: LGPL-2.1-or-later +""" +Kallsyms +-------- + +The kallsyms module contains helpers which allow you to use the built-in +kallsyms symbol table for drgn object lookup. Combined with an alternative type +information source, this can enable debugging Linux kernel core dumps without +the corresponding DWARF debuginfo files. 
+""" +import re +from typing import Dict + +from drgn import KallsymsFinder, Program + +__all__ = ("make_kallsyms_vmlinux_finder",) + + +def _vmcoreinfo_symbols(prog: Program) -> Dict[str, int]: + vmcoreinfo_data = prog["VMCOREINFO"].string_().decode("ascii") + vmcoreinfo_symbols = {} + sym_re = re.compile(r"SYMBOL\(([^)]+)\)=([A-Fa-f0-9]+)") + for line in vmcoreinfo_data.strip().split("\n"): + match = sym_re.fullmatch(line) + if match: + vmcoreinfo_symbols[match.group(1)] = int(match.group(2), 16) + return vmcoreinfo_symbols + + +def make_kallsyms_vmlinux_finder(prog: Program) -> KallsymsFinder: + """ + Create a vmlinux kallsyms finder, which may be passed to + :meth:`drgn.Program.add_symbol_finder`. + + This function automatically finds the necessary information to create a + ``KallsymsFinder`` from the program's VMCOREINFO data. It may fail if the + information is not present. Please note that the debugged Linux kernel must + be 6.0 or later to find this information. + + :returns: a callable symbol finder object + """ + symbol_reqd = [ + "kallsyms_names", + "kallsyms_token_table", + "kallsyms_token_index", + "kallsyms_num_syms", + "kallsyms_offsets", + "kallsyms_relative_base", + "kallsyms_addresses", + "_stext", + ] + symbols = _vmcoreinfo_symbols(prog) + args = [] + for sym in symbol_reqd: + args.append(symbols.get(sym, 0)) + return KallsymsFinder(prog, *args) diff --git a/libdrgn/Makefile.am b/libdrgn/Makefile.am index dfa706374..ce1888259 100644 --- a/libdrgn/Makefile.am +++ b/libdrgn/Makefile.am @@ -66,6 +66,8 @@ libdrgnimpl_la_SOURCES = $(ARCH_DEFS_PYS:_defs.py=.c) \ helpers.h \ io.c \ io.h \ + kallsyms.c \ + kallsyms.h \ language.c \ language.h \ language_c.c \ @@ -157,6 +159,7 @@ _drgn_la_SOURCES = python/constants.c \ python/drgnpy.h \ python/error.c \ python/helpers.c \ + python/kallsyms_finder.c \ python/language.c \ python/main.c \ python/object.c \ diff --git a/libdrgn/kallsyms.c b/libdrgn/kallsyms.c new file mode 100644 index 000000000..77a545a9f 
--- /dev/null +++ b/libdrgn/kallsyms.c @@ -0,0 +1,891 @@ +// Copyright (c) 2023 Oracle and/or its affiliates +// SPDX-License-Identifier: LGPL-2.1-or-later + +#include +#include + +#include "kallsyms.h" +#include "program.h" +#include "drgn.h" + +/** + * This struct contains the tables necessary to reconstruct kallsyms names. + * + * vmlinux (core kernel) kallsyms names are compressed using table compression. + * There is some description of it in the kernel's "scripts/kallsyms.c", but + * this is a brief overview that should make the code below comprehensible. + * + * Table compression uses the remaining 128 characters not defined by ASCII and + * maps them to common substrings (e.g. the prefix "write_"). Each name is + * represented as a sequence of bytes which refers to strings in this table. + * The two arrays below comprise this table: + * + * - token_table: this is one long string with all of the tokens concatenated + * together, e.g. "a\0b\0c\0...z\0write_\0read_\0..." + * - token_index: this is a 256-entry long array containing the index into + * token_table where you'll find that token's string. + * + * To decode a string, for each byte you simply index into token_index, then use + * that to index into token_table, and copy that string into your buffer. + * + * The actual kallsyms symbol names are concatenated into a buffer called + * "names". The first byte in a name is the length (in tokens, not decoded + * bytes) of the symbol name. The remaining "length" bytes are decoded via the + * table as described above. The first decoded byte is a character representing + * what type of symbol this is (e.g. text, data structure, etc). + */ +struct kallsyms_reader { + uint32_t num_syms; + uint8_t *names; + char *token_table; + uint16_t *token_index; + bool long_names; +}; + +/* + * We determine symbol length by the start of the subsequent symbol. 
+ * Unfortunately, there can be large gaps in the symbol table, for instance the + * Linux kernel has percpu symbols near the beginning of the address space, and + * a large gap before normal kernel symbols. The result of this is that we can + * create symbols with incredibly large sizes, and then drgn's symbolization + * will print addresses using that symbol and a very large offset, which is + * absolutely meaningless. + * + * To avoid this, we set a cap on the length of a symbol. Unfortunately, this is + * a heuristic. It's entirely possible to have very large data symbols. This + * value is chosen somewhat arbitrarily, but seems to produce decent results. + */ +#define MAX_SYMBOL_LENGTH 0x10000 + +/* + * Since 73bbb94466fd3 ("kallsyms: support "big" kernel symbols"), the + * "kallsyms_names" array may use the most significant bit to indicate that the + * initial element for each symbol (normally representing the number of tokens + * in the symbol) requires two bytes. + * + * Unfortunately, that means that values 128-255 are now ambiguous: on older + * kernels, they should be interpreted literally, but on newer kernels, they + * require treating as a two byte sequence. Since the commit included no changes + * to the symbol names or vmcoreinfo, there's no way to detect it except via + * heuristics. + * + * The commit in question is a new feature and not likely to be backported to + * stable, so our heuristic is that it was first included in kernel 6.1. + * However, we first check the environment variable DRGN_KALLSYMS_LONG: if it + * exists, then we use its first character to determine our behavior: 1, y, Y + * all indicate that we should use long names. 0, n, N all indicate that we + * should not. 
+ */ +static bool guess_long_names(struct drgn_program *prog) +{ + const char *env = getenv("DRGN_KALLSYMS_LONG"); + const char *osrelease; + int i; + int major = 0, minor = 0; + + if (env) { + if (*env == '1' || *env == 'y' || *env == 'Y') + return true; + else if (*env == '0' || *env == 'n' || *env == 'N') + return false; + } + + osrelease = prog->vmcoreinfo.osrelease; + for (i = 0; i < sizeof(prog->vmcoreinfo.osrelease) && osrelease[i]; i++) { + char c = osrelease[i]; + if (c < '0' || c > '9') + break; + major *= 10; + major += osrelease[i] - '0'; + } + for (i = i + 1; i < sizeof(prog->vmcoreinfo.osrelease) && osrelease[i] && osrelease[i] != '.'; i++) { + char c = osrelease[i]; + if (c < '0' || c > '9') + break; + minor *= 10; + minor += osrelease[i] - '0'; + } + return (major == 6 && minor >= 1) || major > 6; +} + +/** + * Copy the kallsyms names tables from the program into host memory. + * @param prog Program to read from + * @param kr kallsyms_reader to populate + * @param loc Addresses of the kallsyms variables within the program + */ +static struct drgn_error * +kallsyms_copy_tables(struct drgn_program *prog, struct kallsyms_reader *kr, + struct kallsyms_locations *loc) +{ + struct drgn_error *err; + const size_t token_index_size = (UINT8_MAX + 1) * sizeof(uint16_t); + uint64_t last_token; + size_t token_table_size, names_idx; + char data; + uint8_t len_u8; + int len; + bool bswap; + + err = drgn_program_bswap(prog, &bswap); + if (err) + return err; + + /* Read num_syms from vmcore */ + err = drgn_program_read_u32(prog, + loc->kallsyms_num_syms, + false, &kr->num_syms); + if (err) + return err; + if (bswap) + kr->num_syms = bswap_32(kr->num_syms); + + /* Read the constant-sized token_index table (256 entries) */ + kr->token_index = malloc(token_index_size); + if (!kr->token_index) + return &drgn_enomem; + err = drgn_program_read_memory(prog, kr->token_index, + loc->kallsyms_token_index, + token_index_size, false); + if (err) + return err; + if (bswap) { + for (size_t i = 0; i < 
UINT8_MAX + 1; i++) { /* token_index always has 256 entries, independent of num_syms */ + kr->token_index[i] = bswap_16(kr->token_index[i]); + } + } + + /* + * Find the end of the last token, so we get the overall length of + * token_table. Then copy the token_table into host memory. + */ + last_token = loc->kallsyms_token_table + kr->token_index[UINT8_MAX]; + do { + err = drgn_program_read_u8(prog, last_token, false, + (uint8_t *)&data); + if (err) + return err; + + last_token++; + } while (data); + token_table_size = last_token - loc->kallsyms_token_table; /* last_token is already one past the final NUL */ + kr->token_table = malloc(token_table_size); + if (!kr->token_table) + return &drgn_enomem; + err = drgn_program_read_memory(prog, kr->token_table, + loc->kallsyms_token_table, + token_table_size, false); + if (err) + return err; + + /* Now find the end of the names array by skipping through it, then copy + * that into host memory. */ + names_idx = 0; + kr->long_names = guess_long_names(prog); + for (size_t i = 0; i < kr->num_syms; i++) { + err = drgn_program_read_u8(prog, + loc->kallsyms_names + names_idx, + false, &len_u8); + if (err) + return err; + len = len_u8; + if ((len & 0x80) && kr->long_names) { + err = drgn_program_read_u8(prog, + loc->kallsyms_names + names_idx + 1, + false, &len_u8); + if (err) + return err; + len = (len & 0x7F) | (len_u8 << 7); + names_idx++; + } + names_idx += len + 1; + } + kr->names = malloc(names_idx); + if (!kr->names) + return &drgn_enomem; + err = drgn_program_read_memory(prog, kr->names, + loc->kallsyms_names, + names_idx, false); + if (err) + return err; + + return NULL; +} + +/** + * Write the symbol starting at @a offset into @a result. + * @param kr Registry containing kallsyms data + * @param offset Starting index within "names" array for this symbol + * @param result Buffer to write output symbol to + * @param maxlen Size of output buffer, to avoid overruns + * @param[out] kind_ret Where to write the symbol kind data + * @param[out] bytes_ret How many bytes were output (incl. 
NUL) + * @returns The offset of the next symbol + */ +static unsigned int +kallsyms_expand_symbol(struct kallsyms_reader *kr, unsigned int offset, + char *result, size_t maxlen, char *kind_ret, + size_t *bytes_ret) +{ + uint8_t *data = &kr->names[offset]; + unsigned int len = *data; + bool skipped_first = false; + size_t bytes = 0; + + if ((len & 0x80) && kr->long_names) { + data++; + offset++; + len = (0x7F & len) | (*data << 7); + } + + offset += len + 1; + data += 1; + while (len) { + char *token_ptr = &kr->token_table[kr->token_index[*data]]; + while (*token_ptr) { + if (skipped_first) { + if (maxlen <= 1) + goto tail; + *result = *token_ptr; + result++; + maxlen--; + bytes++; + } else { + if (kind_ret) + *kind_ret = *token_ptr; + skipped_first = true; + } + token_ptr++; + } + + data++; + len--; + } + +tail: + *result = '\0'; + bytes++; + *bytes_ret = bytes; + return offset; +} + +/** Decode all symbol names from @a kr and place them into @a reg */ +static struct drgn_error * +kallsyms_create_symbol_array(struct kallsyms_finder *reg, struct kallsyms_reader *kr) +{ + uint8_t token_lengths[UINT8_MAX+1]; + + /* Compute the length of each token */ + for (int i = 0; i <= UINT8_MAX; i++) { + token_lengths[i] = strlen(&kr->token_table[kr->token_index[i]]); + } + + /* Now compute the length of all symbols together */ + size_t names_idx = 0; + size_t length = 0; + for (int i = 0; i < kr->num_syms; i++) { + unsigned int num_tokens = kr->names[names_idx]; + if ((num_tokens & 0x80) && kr->long_names) + num_tokens = (num_tokens & 0x7F) | (kr->names[++names_idx] << 7); + for (int j = names_idx + 1; j < names_idx + num_tokens + 1; j++) + length += token_lengths[kr->names[j]]; + length++; /* nul terminator */ + names_idx += num_tokens + 1; + } + + /* We use uint32_t to index into the array of strings. That allows for + * 4GiB of names which should be plenty, but still: check for overflow. 
*/ + if (length >= UINT32_MAX) + return drgn_error_format(DRGN_ERROR_OUT_OF_BOUNDS, + "kallsyms string table is too large: %zu", + length); + + reg->strings = malloc(length); + reg->strings_len = length; + reg->names = calloc(kr->num_syms, sizeof(*reg->names)); + reg->types = malloc(kr->num_syms); + reg->num_syms = kr->num_syms; + if (!reg->strings || !reg->names || !reg->types) + return &drgn_enomem; + + names_idx = 0; + uint32_t symbols_idx = 0; + for (int i = 0; i < kr->num_syms; i++) { + size_t bytes = 0; + names_idx = kallsyms_expand_symbol(kr, names_idx, + reg->strings + symbols_idx, + length - symbols_idx, &reg->types[i], + &bytes); + reg->names[i] = symbols_idx; + symbols_idx += (uint32_t) bytes; + } + return NULL; +} + +static int kallsyms_name_compar(const void *lhs, const void *rhs, void *arg) +{ + struct kallsyms_finder *kr = arg; + uint32_t left_ix = *(const uint32_t *)lhs; + uint32_t right_ix = *(const uint32_t *)rhs; + return strcmp(&kr->strings[kr->names[left_ix]], + &kr->strings[kr->names[right_ix]]); +} + +static struct drgn_error * +kallsyms_create_htab(struct kallsyms_finder *kr) +{ + /* + * A sorted list of symbol indices. Entries of the hash table will point + * into this list for a certain number of elements. + */ + kr->sorted = malloc(kr->num_syms * sizeof(kr->sorted[0])); + if (!kr->sorted) /* must be checked before the array is filled below */ + return &drgn_enomem; + for (uint32_t i = 0; i < kr->num_syms; i++) + kr->sorted[i] = i; + + qsort_r(kr->sorted, kr->num_syms, sizeof(kr->sorted[0]), + kallsyms_name_compar, kr); + + if (!drgn_kallsyms_names_reserve(&kr->htab, kr->num_syms)) + return &drgn_enomem; + + /* For each unique symbol name, insert the index, and number of + * occurrences into the hash table. 
*/ + struct drgn_kallsyms_names_entry entry; + uint32_t current = 0; + while (current < kr->num_syms) { + char *current_str = &kr->strings[kr->names[kr->sorted[current]]]; + uint32_t next = current + 1; + while (next < kr->num_syms) { + char *next_str = &kr->strings[kr->names[kr->sorted[next]]]; + if (strcmp(current_str, next_str) != 0) + break; + next++; + } + + entry.key = current_str; + entry.value.start = current; + entry.value.end = next; + drgn_kallsyms_names_insert(&kr->htab, &entry, NULL); + current = next; + } + return NULL; +} + +/** Copies and decodes symbol names from the program. */ +static struct drgn_error * +kallsyms_load_names(struct kallsyms_finder *reg, struct kallsyms_locations *loc) +{ + struct drgn_error *err; + struct kallsyms_reader reader = {0}; + + err = kallsyms_copy_tables(reg->prog, &reader, loc); + if (err) + goto out; + + err = kallsyms_create_symbol_array(reg, &reader); +out: + free(reader.names); + free(reader.token_index); + free(reader.token_table); + return err; +} + +/** Lookup @a name in the registry @a kr, and return the index of the symbol */ +static int drgn_kallsyms_lookup(struct kallsyms_finder *kr, const char *name) +{ + struct drgn_kallsyms_names_iterator it = + drgn_kallsyms_names_search(&kr->htab, (char **)&name); + if (it.entry) { + return kr->sorted[it.entry->value.start]; + } + return -1; +} + +/** Return the address of symbol at @a index*/ +static uint64_t +kallsyms_address(struct kallsyms_finder *kr, unsigned int index) +{ + return kr->addresses[index]; +} + +static void drgn_symbol_from_kallsyms(struct kallsyms_finder *kr, int index, + struct drgn_symbol *ret) +{ + char kind = kr->types[index]; + char kind_lower = tolower(kind); + ret->name = &kr->strings[kr->names[index]]; + ret->address = kallsyms_address(kr, index); + if (index + 1 < kr->num_syms) { /* the last symbol has no successor to measure against */ + uint64_t size = kallsyms_address(kr, index + 1) - ret->address; + if (size < MAX_SYMBOL_LENGTH) + ret->size = size; + else + ret->size = 0; + } else { + ret->size = 0; 
+ } + + ret->binding = DRGN_SYMBOL_BINDING_GLOBAL; + if (kind == 'u') + ret->binding = DRGN_SYMBOL_BINDING_UNIQUE; + else if (kind_lower == 'v' || kind_lower == 'w') + ret->binding = DRGN_SYMBOL_BINDING_WEAK; + else if (isupper(kind)) + ret->binding = DRGN_SYMBOL_BINDING_GLOBAL; + else + /* If lowercase, the symbol is usually local, but it's + * not guaranteed. Use unknown for safety here. */ + ret->binding = DRGN_SYMBOL_BINDING_UNKNOWN; + + switch (kind_lower) { + case 'b': /* bss */ + case 'c': /* uninitialized data */ + case 'd': /* initialized data */ + case 'g': /* initialized data (small objects) */ + case 'r': /* read-only data */ + ret->kind = DRGN_SYMBOL_KIND_OBJECT; + break; + case 't': /* text */ + ret->kind = DRGN_SYMBOL_KIND_FUNC; + break; + default: + ret->kind = DRGN_SYMBOL_KIND_UNKNOWN; + } + /* NOTE: The name field is owned by the kallsyms finder. + * Once the kallsyms finder is bound to the program, it cannot be + * unbound, and so it shares lifetime with the Program. + */ + ret->name_lifetime = DRGN_LIFETIME_STATIC; +} + +static int kallsyms_addr_compar(const void *key_void, const void *memb_void) +{ + const uint64_t *key = key_void; + const uint64_t *memb = memb_void; + + /* We are guaranteed that: (min <= key <= max), so we can fearlessly + * index one beyond memb, so long as we've checked that key > memb. 
+ */ + if (*key == *memb) + return 0; + else if (*key < *memb) + return -1; + else if (*key < memb[1]) + return 0; + else + return 1; +} + +static inline struct drgn_error * +add_result(struct kallsyms_finder *kr, struct drgn_symbol_result_builder *builder, int index) +{ + struct drgn_symbol *symbol = malloc(sizeof(*symbol)); + if (!symbol) + return &drgn_enomem; + drgn_symbol_from_kallsyms(kr, index, symbol); + if (drgn_symbol_result_builder_add(builder, symbol)) { + return NULL; + } else { + free(symbol); + return &drgn_enomem; + } +} + +struct drgn_error * +drgn_kallsyms_symbol_finder(const char *name, uint64_t address, + enum drgn_find_symbol_flags flags, void *arg, + struct drgn_symbol_result_builder *builder) +{ + struct kallsyms_finder *kr = arg; + uint64_t begin = kallsyms_address(kr, 0); + uint64_t end = kallsyms_address(kr, kr->num_syms - 1); + struct drgn_error *err = NULL; + + /* We assume the last symbol is "zero length" for simplicity. + * Short-circuit the search when we're searching outside the address + * range. + */ + if (flags & DRGN_FIND_SYMBOL_ADDR) { + uint64_t *res; + if (address < begin || address > end) + return NULL; + res = bsearch(&address, kr->addresses, kr->num_syms, sizeof(address), + kallsyms_addr_compar); + /* If the gap between symbols > MAX_SYMBOL_LENGTH, then we infer that + * the symbol doesn't contain the address, so fail. 
*/ + if (!res) + return NULL; + /* The last symbol has no successor, so there is no gap to measure. */ + if (res + 1 < kr->addresses + kr->num_syms && + res[1] - res[0] > MAX_SYMBOL_LENGTH) + return NULL; + return add_result(kr, builder, res - kr->addresses); + } else if (flags & DRGN_FIND_SYMBOL_NAME) { + struct drgn_kallsyms_names_iterator it = + drgn_kallsyms_names_search(&kr->htab, (char **)&name); + if (!it.entry) + return NULL; + for (uint32_t i = it.entry->value.start; i < it.entry->value.end; i++) { + err = add_result(kr, builder, kr->sorted[i]); + /* Do not advance 'it' here: the loop bounds are re-read from it.entry. */ + if (err || flags & DRGN_FIND_SYMBOL_ONE) + break; + } + return err; + } else { + for (int i = 0; i < kr->num_syms; i++) + if ((err = add_result(kr, builder, i)) + || (flags & DRGN_FIND_SYMBOL_ONE)) + return err; + } + return NULL; +} + +/** Compute an address via the CONFIG_KALLSYMS_ABSOLUTE_PERCPU method*/ +static uint64_t absolute_percpu(uint64_t base, int32_t val) +{ + if (val >= 0) + return (uint64_t) val; + else + return base - 1 - val; +} + +/** + * Load the kallsyms address information from @a prog + * + * Just as symbol name loading is complex, so is address loading. Addresses may + * be stored directly as an array of pointers, but more commonly, they are + * stored as an array of 32-bit integers which are related to an offset. This + * function decodes the addresses into a plain array of 64-bit addresses. 
+ * + * @param prog The program to read from + * @param kr The symbol registry to fill + * @param loc Locations of the kallsyms variables in program memory + * @returns NULL on success, or error + */ +static struct drgn_error * +kallsyms_load_addresses(struct drgn_program *prog, struct kallsyms_finder *kr, + struct kallsyms_locations *loc) +{ + struct drgn_error *err = NULL; + bool bswap, bits64; + uint32_t *addr32; + + err = drgn_program_bswap(prog, &bswap); + if (err) + return err; + err = drgn_program_is_64_bit(prog, &bits64); + if (err) + return err; + + kr->addresses = malloc(kr->num_syms * sizeof(uint64_t)); + if (!kr->addresses) + return &drgn_enomem; + + if (loc->kallsyms_addresses) { + /* + * The kallsyms addresses are stored as plain addresses in an + * array of unsigned long! Read the appropriate size array and + * do any necessary byte swaps. + */ + if (!bits64) { + addr32 = malloc(kr->num_syms * sizeof(uint32_t)); + if (!addr32) + return &drgn_enomem; + + err = drgn_program_read_memory(prog, addr32, + loc->kallsyms_addresses, + kr->num_syms * sizeof(uint32_t), + false); + if (err) { + free(addr32); + return err; + } + for (int i = 0; i < kr->num_syms; i++) { + if (bswap) + kr->addresses[i] = bswap_32(addr32[i]); + else + kr->addresses[i] = addr32[i]; + } + free(addr32); + } else { + /* Entries are 8 bytes wide here: read the whole array. */ + err = drgn_program_read_memory(prog, kr->addresses, + loc->kallsyms_addresses, + kr->num_syms * sizeof(uint64_t), + false); + if (err) + return err; + if (bswap) + for (int i = 0; i < kr->num_syms; i++) + kr->addresses[i] = bswap_64(kr->addresses[i]); + } + } else { + /* + * The kallsyms addresses are stored in an array of 4-byte + * values, which can be interpreted in two ways: + * (1) if CONFIG_KALLSYMS_ABSOLUTE_PERCPU is enabled, then + * positive values are addresses, and negative values are + * offsets from a base address. + * (2) otherwise, the 4-byte values are unsigned offsets from + * that same base address + * First, read the values, then figure out which way to + * interpret them.
+ */ + uint64_t relative_base; + if (bits64) { + /* + * drgn_program_read_u64() already returns the value in host + * byte order, so swapping again here would undo that. + */ + err = drgn_program_read_u64(prog, loc->kallsyms_relative_base, + false, &relative_base); + if (err) + return err; + } else { + uint32_t rel32; + /* Likewise, drgn_program_read_u32() handles byte order. */ + err = drgn_program_read_u32(prog, loc->kallsyms_relative_base, + false, &rel32); + if (err) + return err; + relative_base = rel32; + } + addr32 = malloc(kr->num_syms * sizeof(uint32_t)); + if (!addr32) + return &drgn_enomem; + + /* Raw memory reads are not byte-swapped, so the offsets are fixed up below. */ + err = drgn_program_read_memory(prog, addr32, + loc->kallsyms_offsets, + kr->num_syms * sizeof(uint32_t), + false); + if (err) { + free(addr32); + return err; + } + if (bswap) + for (int i = 0; i < kr->num_syms; i++) + addr32[i] = bswap_32(addr32[i]); + + /* + * Now that we've read the offsets data, we need to determine + * how to interpret them. To do this, use the _stext symbol. We + * have the correct value from vmcoreinfo. Compute it both ways + * and pick the correct interpretation.
+ */ + int stext_idx = drgn_kallsyms_lookup(kr,"_stext"); + if (stext_idx < 0) { + free(addr32); + return drgn_error_create( + DRGN_ERROR_OTHER, + "Could not find _stext symbol in kallsyms"); + } + + uint64_t stext_abs = relative_base + addr32[stext_idx]; + uint64_t stext_pcpu = absolute_percpu(relative_base, (int32_t)addr32[stext_idx]); + if (stext_abs == loc->_stext) { + for (int i = 0; i < kr->num_syms; i++) + kr->addresses[i] = relative_base + addr32[i]; + } else if (stext_pcpu == loc->_stext) { + for (int i = 0; i < kr->num_syms; i++) + kr->addresses[i] = absolute_percpu(relative_base, (int32_t)addr32[i]); + } else { + err = drgn_error_create( + DRGN_ERROR_OTHER, + "Unable to interpret kallsyms address data"); + } + free(addr32); + } + return err; +} + +/** Free all data held by @a kr, but not @a kr itself; a NULL @a kr is ignored */ +void drgn_kallsyms_destroy(struct kallsyms_finder *kr) +{ + if (kr) { + drgn_kallsyms_names_deinit(&kr->htab); + free(kr->sorted); + free(kr->addresses); + free(kr->strings); + free(kr->names); + free(kr->types); + } +} + +/** Load kallsyms data from vmcore + vmcoreinfo data; on error @a kr is destroyed */ +static struct drgn_error * +drgn_kallsyms_from_vmcore(struct kallsyms_finder *kr, struct drgn_program *prog, + struct kallsyms_locations *loc) +{ + struct drgn_error *err; + + memset(kr, 0, sizeof(*kr)); + kr->prog = prog; + drgn_kallsyms_names_init(&kr->htab); + + err = kallsyms_load_names(kr, loc); + if (err) + goto out; + + err = kallsyms_create_htab(kr); + if (err) + goto out; + + err = kallsyms_load_addresses(prog, kr, loc); + if (err) + goto out; + + return NULL; + +out: + drgn_kallsyms_destroy(kr); + return err; +} + +struct allocated { /* capacities of the arrays grown by kallsyms_append() */ + uint32_t symbols; /* entries allocated in names, addresses and types */ + size_t symbol_buffer; /* bytes allocated for the strings buffer */ +}; + +/** Append a symbol onto the kallsyms finder, expanding the allocations if needed.
*/ +static struct drgn_error * +kallsyms_append(struct kallsyms_finder *kr, struct allocated *a, const char *name, uint64_t address, char type) +{ + size_t name_len = strlen(name) + 1; + if (kr->num_syms == a->symbols) { + /* + * Grow through a temporary: when realloc() fails the old block + * is still allocated, and it must stay reachable from @a kr so + * that drgn_kallsyms_destroy() can free it. + */ + uint32_t new_symbols = a->symbols ? a->symbols * 2 : 1024; + void *tmp; + + tmp = realloc(kr->names, new_symbols * sizeof(kr->names[0])); + if (!tmp) + return &drgn_enomem; + kr->names = tmp; + tmp = realloc(kr->addresses, new_symbols * sizeof(kr->addresses[0])); + if (!tmp) + return &drgn_enomem; + kr->addresses = tmp; + tmp = realloc(kr->types, new_symbols); + if (!tmp) + return &drgn_enomem; + kr->types = tmp; + a->symbols = new_symbols; + } + + /* Double the string buffer until the name and its nul byte fit. */ + size_t new_size = a->symbol_buffer; + while (kr->strings_len + name_len > new_size) + new_size = new_size ? new_size * 2 : 4096; + if (new_size != a->symbol_buffer) { + char *tmp = realloc(kr->strings, new_size); + if (!tmp) + return &drgn_enomem; + kr->strings = tmp; + a->symbol_buffer = new_size; + } + memcpy(&kr->strings[kr->strings_len], name, name_len); + /* + * We can't just store the pointer, since symbol_buffer may move during + * reallocation. Store the index of the string in the buffer, and when + * we finalize everything, we will fix it up.
+ */ + kr->names[kr->num_syms] = kr->strings_len; + kr->addresses[kr->num_syms] = address; + kr->types[kr->num_syms] = type; + kr->num_syms++; + kr->strings_len += name_len; + return NULL; +} + +/** + * Trim the buffers to fit their contents + * + * Shrinking is only an optimization: if realloc() fails, the original, larger + * block is still valid and is kept. Empty buffers are left alone, because + * realloc() with a size of zero may free the block and return NULL. + * + * @returns NULL; this cannot currently fail + */ +static struct drgn_error * +kallsyms_finalize(struct kallsyms_finder *kr) +{ + void *tmp; + + if (kr->num_syms) { + tmp = realloc(kr->names, kr->num_syms * sizeof(kr->names[0])); + if (tmp) + kr->names = tmp; + tmp = realloc(kr->addresses, kr->num_syms * sizeof(kr->addresses[0])); + if (tmp) + kr->addresses = tmp; + tmp = realloc(kr->types, kr->num_syms * sizeof(kr->types[0])); + if (tmp) + kr->types = tmp; + } + if (kr->strings_len) { + tmp = realloc(kr->strings, kr->strings_len); + if (tmp) + kr->strings = tmp; + } + return NULL; +} + +/** Load kallsyms directly from the /proc/kallsyms file */ +static struct drgn_error *drgn_kallsyms_from_proc(struct kallsyms_finder *kr, + struct drgn_program *prog) +{ + char *line = NULL; + size_t line_size = 0; + ssize_t res; + size_t line_number = 1; + struct allocated allocated = {0}; + struct drgn_error *err = NULL; + FILE *fp = fopen("/proc/kallsyms", "r"); + if (!fp) + return drgn_error_create_os("Error opening kallsyms", errno, "/proc/kallsyms"); + + memset(kr, 0, sizeof(*kr)); + kr->prog = prog; + drgn_kallsyms_names_init(&kr->htab); + + while ((res = getline(&line, &line_size, fp)) != -1) { + char *save = NULL; + char *name, *addr_str, *type_str, *mod, *addr_rem; + char type; + uint64_t addr; + + addr_str = strtok_r(line, " \t\r\n", &save); + type_str = strtok_r(NULL," \t\r\n", &save); + name = strtok_r(NULL," \t\r\n", &save); + mod = strtok_r(NULL," \t\r\n", &save); + + if (!addr_str || !type_str || !name) { + err = drgn_error_format(DRGN_ERROR_SYNTAX, "Error parsing kallsyms line %zu", line_number); + break; + } + if (mod) + break; + type = *type_str; + addr = strtoull(addr_str, &addr_rem, 16); + if (*addr_rem) { + /* addr_rem should be set to the first un-parsed character, and + * since the entire string should be a valid base 16
integer, + * we expect it to be \0 */ + err = drgn_error_format(DRGN_ERROR_SYNTAX, + "Invalid address \"%s\" in kallsyms line %zu", + addr_str, line_number); + break; + } + err = kallsyms_append(kr, &allocated, name, addr, type); + if (err) + break; + line_number++; + } + + /* + * Keep any parse or allocation error raised inside the loop: only + * finalize and index the table when every line was read cleanly. + */ + if (!err && ferror(fp)) + err = drgn_error_create_os("Error reading kallsyms", errno, "/proc/kallsyms"); + if (!err) + err = kallsyms_finalize(kr); + if (!err) + err = kallsyms_create_htab(kr); + fclose(fp); + free(line); + if (err) + drgn_kallsyms_destroy(kr); + return err; +} + +struct drgn_error *drgn_kallsyms_init(struct kallsyms_finder *kr, + struct drgn_program *prog, + struct kallsyms_locations *loc) +{ + /* + * There are two ways to parse kallsyms data: by using /proc/kallsyms, + * or by finding the necessary symbols in the vmcoreinfo and using them + * to read out the kallsyms data from the vmcore. + * + * Reading /proc/kallsyms is more straightforward, performant, and it + * has broader kernel version support: it should be preferred for live + * systems. + * + * Parsing kallsyms from a core dump is more involved, and it requires + * that the kernel publish some symbol addresses in the VMCOREINFO note. + * The following kernel commits are required, and were introduced in + * 6.0: + * + * - 5fd8fea935a10 ("vmcoreinfo: include kallsyms symbols") + * - f09bddbd86619 ("vmcoreinfo: add kallsyms_num_syms symbol") + */ + if (prog->flags & DRGN_PROGRAM_IS_LIVE) + return drgn_kallsyms_from_proc(kr, prog); + else if (loc->kallsyms_names && loc->kallsyms_token_table + && loc->kallsyms_token_index && loc->kallsyms_num_syms) + return drgn_kallsyms_from_vmcore(kr, prog, loc); + else + return drgn_error_create( + DRGN_ERROR_MISSING_DEBUG_INFO, + "The symbols: kallsyms_names, kallsyms_token_table, " + "kallsyms_token_index, and kallsyms_num_syms were not " + "found in VMCOREINFO, and the program is not live, " + "so /proc/kallsyms cannot be used.
There is not enough " + "information to use the kallsyms symbol finder." + ); +} diff --git a/libdrgn/kallsyms.h b/libdrgn/kallsyms.h new file mode 100644 index 000000000..e8c9a710a --- /dev/null +++ b/libdrgn/kallsyms.h @@ -0,0 +1,134 @@ +// Copyright (c) 2023 Oracle and/or its affiliates +// SPDX-License-Identifier: LGPL-2.1-or-later + +/** + * @file + * + * Kallsyms data handling + * + * See @ref Kallsyms + */ + +#ifndef DRGN_KALLSYMS_H +#define DRGN_KALLSYMS_H + +#include +#include + +#include "hash_table.h" + +struct drgn_program; +struct drgn_module; +struct vmcoreinfo; +enum drgn_find_symbol_flags; +struct drgn_symbol_result_builder; + +struct kallsyms_locations { + uint64_t kallsyms_names; + uint64_t kallsyms_token_table; + uint64_t kallsyms_token_index; + uint64_t kallsyms_num_syms; + uint64_t kallsyms_offsets; + uint64_t kallsyms_relative_base; + uint64_t kallsyms_addresses; + uint64_t _stext; +}; + +/** + * @ingroup KernelInfo + * + * @defgroup Kallsyms Kallsyms symbol table + * + * Using the kallsyms data from within the program as a symbol table. + * + * @{ + */ + +struct symbol_entry { /* half-open range [start, end) into kallsyms_finder::sorted */ + uint32_t start; /* first position in the sorted array for this name */ + uint32_t end; /* one past the last position for this name */ +}; + +DEFINE_HASH_MAP(drgn_kallsyms_names, char *, struct symbol_entry, + c_string_key_hash_pair, c_string_key_eq); + +/** + * Holds kallsyms data copied from the kernel + * + * Kallsyms data are in increasing sorted order by address. Each symbol is + * identified by its index, which we can assume fits in a uint32_t. The + * essential data is stored in arrays of length "num_syms": the memory address, + * the symbol type, and the index into the string table. + * + * Strings are stored in a single buffer, all concatenated together and + * separated by nul bytes.
+ */ +struct kallsyms_finder { + /** Program owning this registry */ + struct drgn_program *prog; + + /** Number of symbols */ + uint32_t num_syms; + /** Array of symbol addresses */ + uint64_t *addresses; + /** Array of one-character type codes */ + char *types; + /** Array of offsets into @a strings, one per symbol name */ + uint32_t *names; + + /** Buffer holding every nul-terminated symbol name, back to back */ + char *strings; + /** Number of bytes of @a strings in use */ + uint32_t strings_len; + + /** Array of symbol indices, sorted by name. Used by the htab. */ + uint32_t *sorted; + /** Map of symbol names to a range of positions in @a sorted */ + struct drgn_kallsyms_names htab; +}; + + +/** + * Initialize kallsyms data + * + * Load the kallsyms symbol table into the caller-provided @a reg. For a live + * program, /proc/kallsyms is parsed. Otherwise the table is read from program + * memory using the addresses in @a locations; if the required addresses are + * missing, a DRGN_ERROR_MISSING_DEBUG_INFO error is returned. On error, + * anything loaded so far has already been released, so @a reg must not be + * passed to drgn_kallsyms_destroy(). + * + * @param[out] reg Finder to initialize + * @param prog Program to search + * @param locations Addresses of the kallsyms variables (e.g. from VMCOREINFO) + * @returns NULL on success, or an error + */ +struct drgn_error *drgn_kallsyms_init(struct kallsyms_finder *reg, + struct drgn_program *prog, + struct kallsyms_locations *locations); + +/** + * Find a symbol using the symbol finder object + * + * This function may be passed to drgn_program_add_symbol_finder, along with a + * pointer to the struct kallsyms_finder, in order to find symbols in the + * vmlinux kallsyms. + */ +struct drgn_error * +drgn_kallsyms_symbol_finder(const char *name, uint64_t address, + enum drgn_find_symbol_flags flags, void *arg, + struct drgn_symbol_result_builder *builder); + +/** + * Destroy kallsyms data + * + * Frees all resources held by the kallsyms finder.
Please note that if the + * finder has been added to the program, then this *will* cause errors. + * + * @param kr Finder to destroy; the struct itself is not freed, NULL is ignored + */ +void drgn_kallsyms_destroy(struct kallsyms_finder *kr); + +/** @} */ + +#endif // DRGN_KALLSYMS_H diff --git a/libdrgn/python/drgnpy.h b/libdrgn/python/drgnpy.h index 56608e575..b44f9a26f 100644 --- a/libdrgn/python/drgnpy.h +++ b/libdrgn/python/drgnpy.h @@ -109,6 +109,11 @@ typedef struct { PyObject *attr_cache; } DrgnType; +typedef struct { + PyObject_HEAD + struct kallsyms_finder *finder; /* heap-allocated; freed in KallsymsFinder_dealloc() */ +} KallsymsFinder; + typedef struct { PyObject_HEAD /* @@ -225,6 +230,7 @@ extern PyObject *TypeKind_class; extern PyTypeObject DrgnObject_type; extern PyTypeObject DrgnType_type; extern PyTypeObject FaultError_type; +extern PyTypeObject KallsymsFinder_type; extern PyTypeObject Language_type; extern PyTypeObject ObjectIterator_type; extern PyTypeObject Platform_type; diff --git a/libdrgn/python/kallsyms_finder.c b/libdrgn/python/kallsyms_finder.c new file mode 100644 index 000000000..35a32dbad --- /dev/null +++ b/libdrgn/python/kallsyms_finder.c @@ -0,0 +1,147 @@ +// Copyright (c) 2023 Oracle and/or its affiliates +// SPDX-License-Identifier: LGPL-2.1-or-later + +#include "drgn.h" +#include "drgnpy.h" +#include "kallsyms.h" +#include "modsupport.h" +#include "pyerrors.h" +#include "symbol.h" + +static void KallsymsFinder_dealloc(KallsymsFinder *self) +{ + /* This must not be called if the finder has been added to the program. The + * program should take a reference and prevent deallocation.
*/ + drgn_kallsyms_destroy(self->finder); + free(self->finder); + Py_TYPE(self)->tp_free((PyObject *)self); +} + + +static PyObject *KallsymsFinder_repr(KallsymsFinder *self) +{ + return (PyObject *)PyUnicode_FromString("KallsymsFinder()"); +} + +/** + * Look up symbols: finder(name, address, one) -> list of Symbol + * + * @a name and @a address may each be None to leave that criterion out. The + * result is always a list, even when @a one is true. + */ +static PyObject *KallsymsFinder_call(KallsymsFinder *self, PyObject *args, PyObject *kwargs) +{ + PyObject *address_obj, *name_obj; + uint64_t address = 0; + const char *name = NULL; + static char *kwnames[] = {"name", "address", "one", NULL}; + unsigned int flags = 0; + /* The "p" format unit stores a C int, so this must not be a bool. */ + int single; + struct drgn_symbol_result_builder builder; + struct drgn_error *err; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OOp:__call__", kwnames, + &name_obj, &address_obj, &single)) + return NULL; + + flags |= single ? DRGN_FIND_SYMBOL_ONE : 0; + + if (address_obj != Py_None) { + if (!PyLong_Check(address_obj)) { + PyErr_SetString(PyExc_TypeError, "address: an integer is required"); + return NULL; + } + flags |= DRGN_FIND_SYMBOL_ADDR; + address = PyLong_AsUint64(address_obj); + /* Overflow check */ + if (PyErr_Occurred()) + return NULL; + } + if (name_obj != Py_None) { + if (!PyUnicode_Check(name_obj)) { + PyErr_SetString(PyExc_TypeError, "name: a string is required"); + return NULL; + } + flags |= DRGN_FIND_SYMBOL_NAME; + name = PyUnicode_AsUTF8(name_obj); + /* Encoding can fail; the Python exception is already set. */ + if (!name) + return NULL; + } + + drgn_symbol_result_builder_init(&builder, flags & DRGN_FIND_SYMBOL_ONE); + + err = drgn_kallsyms_symbol_finder(name, address, flags, self->finder, &builder); + if (err) + goto error; + + /* We return a list regardless */ + if (single) { + struct drgn_symbol *symbol = drgn_symbol_result_builder_single(&builder); + /* NOTE(review): assumes a NULL symbol means nothing matched; + * return an empty list rather than wrapping NULL. */ + _cleanup_pydecref_ PyObject *list = PyList_New(symbol ? 1 : 0); + if (!list) + goto error; + if (symbol) { + PyObject *prog_obj = (PyObject *)container_of(self->finder->prog, Program, prog); + PyObject *pysym = Symbol_wrap(symbol, prog_obj); + if (!pysym) + goto error; + PyList_SET_ITEM(list, 0, pysym); + } + return_ptr(list); + } else { + struct drgn_symbol **syms; + size_t count; +
drgn_symbol_result_builder_array(&builder, &syms, &count); + return Symbol_list_wrap(syms, count, + container_of(self->finder->prog, Program, prog)); + } + + return NULL; +error: + drgn_symbol_result_builder_abort(&builder); + return err ? set_drgn_error(err) : NULL; +} + +/** + * Create a KallsymsFinder for a Program + * + * The eight integer arguments fill a struct kallsyms_locations, which is + * passed to drgn_kallsyms_init() together with the program. + */ +static PyObject *KallsymsFinder_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) +{ + static char *kwnames[] = {"prog", "names", "token_table", "token_index", "num_syms", + "offsets", "relative_base", "addresses", "_stext", NULL}; + struct kallsyms_locations kl; + PyObject *prog_obj; + struct drgn_program *prog; + struct drgn_error *err; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OKKKKKKKK", kwnames, + &prog_obj, &kl.kallsyms_names, &kl.kallsyms_token_table, + &kl.kallsyms_token_index, &kl.kallsyms_num_syms, + &kl.kallsyms_offsets, &kl.kallsyms_relative_base, + &kl.kallsyms_addresses, &kl._stext)) + return NULL; + + if (!PyObject_TypeCheck(prog_obj, &Program_type)) + return PyErr_Format(PyExc_TypeError, "expected Program, not %s", + Py_TYPE(prog_obj)->tp_name); + + prog = &((Program *)prog_obj)->prog; + + struct kallsyms_finder *finder = calloc(1, sizeof(*finder)); + if (!finder) + return set_drgn_error(&drgn_enomem); + /* On failure, drgn_kallsyms_init() has already released what it loaded. */ + err = drgn_kallsyms_init(finder, prog, &kl); + if (err) + goto out; + + KallsymsFinder *finder_obj = call_tp_alloc(KallsymsFinder); + if (!finder_obj) { + /* + * err is NULL on this path, so it must not be handed to + * set_drgn_error(); the failed allocation is expected to have + * set the Python exception already. + */ + drgn_kallsyms_destroy(finder); + free(finder); + return NULL; + } + finder_obj->finder = finder; + /* NOTE(review): this reference is never dropped in KallsymsFinder_dealloc(). */ + Py_INCREF(prog_obj); + return (PyObject *)finder_obj; +out: + free(finder); + return set_drgn_error(err); +} + +PyTypeObject KallsymsFinder_type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "_drgn.KallsymsFinder", + .tp_basicsize = sizeof(KallsymsFinder), + .tp_dealloc = (destructor)KallsymsFinder_dealloc, + .tp_repr = (reprfunc)KallsymsFinder_repr, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_doc = drgn_KallsymsFinder_DOC, + .tp_call = (ternaryfunc)KallsymsFinder_call, + .tp_new = KallsymsFinder_new, +}; diff --git
a/libdrgn/python/main.c b/libdrgn/python/main.c index 2bc92517d..180e43055 100644 --- a/libdrgn/python/main.c +++ b/libdrgn/python/main.c @@ -264,6 +264,7 @@ DRGNPY_PUBLIC PyMODINIT_FUNC PyInit__drgn(void) }) if (add_module_constants(m) || + add_type(m, &KallsymsFinder_type) || add_type(m, &Language_type) || add_languages() || add_type(m, &DrgnObject_type) || PyType_Ready(&ObjectIterator_type) || diff --git a/libdrgn/python/program.c b/libdrgn/python/program.c index c3b11ab2d..0008acd1b 100644 --- a/libdrgn/python/program.c +++ b/libdrgn/python/program.c @@ -10,6 +10,7 @@ #include "../string_builder.h" #include "../util.h" #include "../vector.h" +#include "kallsyms.h" DEFINE_HASH_SET_FUNCTIONS(pyobjectp_set, ptr_key_hash_pair, scalar_key_eq); @@ -591,8 +592,15 @@ static PyObject *Program_add_symbol_finder(Program *self, PyObject *args, if (ret == -1) return NULL; - err = drgn_program_add_symbol_finder(&self->prog, py_symbol_find_fn, - fn); + /* Fast path for the builtin kallsyms finder: register the C callback + * directly instead of going through the Python wrapper */ + if (PyObject_TypeCheck(fn, &KallsymsFinder_type)) + err = drgn_program_add_symbol_finder(&self->prog, + drgn_kallsyms_symbol_finder, + ((KallsymsFinder *)fn)->finder); + else + err = drgn_program_add_symbol_finder(&self->prog, py_symbol_find_fn, + fn); if (err) return set_drgn_error(err); Py_RETURN_NONE;