Brute force string search for BETA10

This commit is contained in:
disinvite 2024-09-01 14:16:53 -04:00
parent f242130382
commit 9545493bc9
2 changed files with 37 additions and 2 deletions

View File

@ -465,6 +465,22 @@ def _populate_exports(self, export_rva: int, _: int):
for (func_addr, name_addr) in combined for (func_addr, name_addr) in combined
] ]
def iter_string(self, encoding: str = "ascii") -> Iterator[Tuple[int, str]]:
    """Yield ``(address, text)`` pairs for candidate strings in .data.

    Each address in ``self._relocated_addrs`` that lies inside the
    ``.data`` section is read with ``self.read_string``. Addresses where
    nothing could be read (``None``) or whose bytes do not decode with
    ``encoding`` are skipped silently.

    Args:
        encoding: Codec used to decode the raw bytes read at each address.

    Yields:
        Tuples of the address and the decoded string found there.
    """
    data_section = self.get_section_by_name(".data")
    for vaddr in self._relocated_addrs:
        # Only addresses that land inside .data are candidates.
        if not data_section.contains_vaddr(vaddr):
            continue

        encoded = self.read_string(vaddr)
        if encoded is None:
            continue

        try:
            text = encoded.decode(encoding)
        except UnicodeDecodeError:
            # The bytes are not valid text in the requested encoding.
            continue

        yield (vaddr, text)
def get_section_by_name(self, name: str) -> Section: def get_section_by_name(self, name: str) -> Section:
section = next( section = next(
filter(lambda section: section.match_name(name), self.sections), filter(lambda section: section.match_name(name), self.sections),

View File

@ -82,8 +82,9 @@ def __init__(
self._load_cvdump() self._load_cvdump()
self._load_markers() self._load_markers()
self._find_original_strings() # Detect floats first to eliminate potential overlap with string data
self._find_float_const() self._find_float_const()
self._find_original_strings()
self._match_imports() self._match_imports()
self._match_exports() self._match_exports()
self._match_thunks() self._match_thunks()
@ -314,7 +315,7 @@ def _find_original_strings(self):
"""Go to the original binary and look for the specified string constants """Go to the original binary and look for the specified string constants
to find a match. This is a (relatively) expensive operation so we only to find a match. This is a (relatively) expensive operation so we only
look at strings that we have not already matched via a STRING annotation.""" look at strings that we have not already matched via a STRING annotation."""
# Release builds give each de-duped string a symbol so they are easy to find and match.
for string in self._db.get_unmatched_strings(): for string in self._db.get_unmatched_strings():
addr = self.orig_bin.find_string(string.encode("latin1")) addr = self.orig_bin.find_string(string.encode("latin1"))
if addr is None: if addr is None:
@ -324,6 +325,24 @@ def _find_original_strings(self):
self._db.match_string(addr, string) self._db.match_string(addr, string)
# Debug builds do not de-dupe the strings, so we need to find them via brute force scan.
# We could try to match the string addrs if there is only one in orig and recomp.
# When we sanitize the asm, the result is the same regardless.
if self.orig_bin.is_debug:
for addr, string in self.orig_bin.iter_string("latin1"):
# Arbitrary threshold of 4, but I think this is what Ghidra does too
if len(string) > 4 and string[0].isalnum():
self._db.set_orig_symbol(
addr, SymbolType.STRING, string, len(string)
)
if self.recomp_bin.is_debug:
for addr, string in self.recomp_bin.iter_string("latin1"):
if len(string) > 4 and string[0].isalnum():
self._db.set_recomp_symbol(
addr, SymbolType.STRING, string, None, len(string)
)
def _find_float_const(self): def _find_float_const(self):
"""Add floating point constants in each binary to the database. """Add floating point constants in each binary to the database.
We are not matching anything right now because these values are not We are not matching anything right now because these values are not