From e59afb48252da7aa67cda1bab83266cad048bf73 Mon Sep 17 00:00:00 2001 From: disinvite Date: Sun, 14 Jan 2024 12:22:14 -0500 Subject: [PATCH] Find imports and thunk functions --- tools/isledecomp/isledecomp/bin.py | 88 +++++++++++++++++++++ tools/isledecomp/isledecomp/compare/core.py | 41 ++++++++++ tools/isledecomp/isledecomp/compare/db.py | 12 ++- 3 files changed, 138 insertions(+), 3 deletions(-) diff --git a/tools/isledecomp/isledecomp/bin.py b/tools/isledecomp/isledecomp/bin.py index 26dd00f8..3b600af6 100644 --- a/tools/isledecomp/isledecomp/bin.py +++ b/tools/isledecomp/isledecomp/bin.py @@ -97,6 +97,8 @@ def __init__(self, filename: str, find_str: bool = False) -> None: self.find_str = find_str self._potential_strings = {} self._relocated_addrs = set() + self.imports = [] + self.thunks = [] def __enter__(self): logger.debug("Bin %s Enter", self.filename) @@ -132,6 +134,8 @@ def __enter__(self): sect.virtual_address += self.imagebase self._populate_relocations() + self._populate_imports() + self._populate_thunks() # This is a (semi) expensive lookup that is not necesssary in every case. # We can find strings in the original if we have coverage using STRING markers. @@ -238,6 +242,78 @@ def _populate_relocations(self): (relocated_addr,) = struct.unpack(" bool: return section is not None + def read_string(self, offset: int, chunk_size: int = 1000) -> Optional[bytes]: + """Read up to chunk_size bytes at offset, cut at the first zero byte.""" + b = self.read(offset, chunk_size) + if b is None: + return None + + try: + return b[: b.index(b"\x00")] + except ValueError: + # No terminator found, just return what we have + return b + def read(self, offset: int, size: int) -> Optional[bytes]: """Read (at most) the given number of bytes at the given virtual address. 
If we return None, the given address points to uninitialized data.""" diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py index 1e58d2cb..40b93bae 100644 --- a/tools/isledecomp/isledecomp/compare/core.py +++ b/tools/isledecomp/isledecomp/compare/core.py @@ -48,6 +48,7 @@ def __init__(self, orig_bin, recomp_bin, pdb_file, code_dir): self._load_cvdump() self._load_markers() self._find_original_strings() + self._match_thunks() def _load_cvdump(self): logger.info("Parsing %s ...", self.pdb_file) @@ -147,6 +148,46 @@ def _find_original_strings(self): self._db.match_string(addr, string) + def _match_thunks(self): + orig_byaddr = { + addr: (dll.upper(), name) for (dll, name, addr) in self.orig_bin.imports + } + recomp_byname = { + (dll.upper(), name): addr for (dll, name, addr) in self.recomp_bin.imports + } + # Combine these two dictionaries. We don't care about imports from recomp + # not found in orig because: + # 1. They shouldn't be there + # 2. They are already identified via cvdump + orig_to_recomp = { + addr: recomp_byname.get(pair, None) for addr, pair in orig_byaddr.items() + } + + # Now that the IAT offsets in each binary are matched up, we need to make + # the connection between the thunk functions. + # We already have the symbol name we need from the PDB. + orig_thunks = { + iat_ofs: func_ofs for (func_ofs, iat_ofs) in self.orig_bin.thunks + } + recomp_thunks = { + iat_ofs: func_ofs for (func_ofs, iat_ofs) in self.recomp_bin.thunks + } + + for orig, recomp in orig_to_recomp.items(): + self._db.set_pair(orig, recomp, SymbolType.POINTER) + thunk_from_orig = orig_thunks.get(orig, None) + thunk_from_recomp = recomp_thunks.get(recomp, None) + + if thunk_from_orig is not None and thunk_from_recomp is not None: + self._db.set_function_pair(thunk_from_orig, thunk_from_recomp) + # Don't compare thunk functions for now. The comparison isn't + # "useful" in the usual sense. 
We are only looking at the 6 + # bytes of the jmp instruction and not the larger context of + # where this function is. Also: these will always match 100% + # because we are searching for a match to register this as a + # function in the first place. + self._db.skip_compare(thunk_from_orig) + def get_one_function(self, addr: int) -> Optional[MatchInfo]: """i.e. verbose mode for reccmp""" return self._db.get_one_function(addr) diff --git a/tools/isledecomp/isledecomp/compare/db.py b/tools/isledecomp/isledecomp/compare/db.py index 3cd25bf7..8cba0c03 100644 --- a/tools/isledecomp/isledecomp/compare/db.py +++ b/tools/isledecomp/isledecomp/compare/db.py @@ -134,14 +134,20 @@ def get_matches(self, compare_type: SymbolType) -> List[MatchInfo]: return cur.fetchall() - def set_function_pair(self, orig: int, recomp: int) -> bool: - """For lineref match or _entry""" + def set_pair( + self, orig: int, recomp: int, compare_type: Optional[SymbolType] = None + ) -> bool: + compare_value = compare_type.value if compare_type is not None else None cur = self._db.execute( "UPDATE `symbols` SET orig_addr = ?, compare_type = ? WHERE recomp_addr = ?", - (orig, SymbolType.FUNCTION.value, recomp), + (orig, compare_value, recomp), ) return cur.rowcount > 0 + + def set_function_pair(self, orig: int, recomp: int) -> bool: + """For lineref match or _entry. Returns True if a row was updated.""" + return self.set_pair(orig, recomp, SymbolType.FUNCTION) # TODO: Both ways required? def skip_compare(self, orig: int):