Brute force string search for BETA10

This commit is contained in:
disinvite 2024-09-01 14:16:53 -04:00
parent f242130382
commit 9545493bc9
2 changed files with 37 additions and 2 deletions

View File

@ -465,6 +465,22 @@ def _populate_exports(self, export_rva: int, _: int):
for (func_addr, name_addr) in combined
]
def iter_string(self, encoding: str = "ascii") -> Iterator[Tuple[int, str]]:
    """Yield ``(address, text)`` for each decodable string candidate in .data.

    Every address in ``self._relocated_addrs`` that lies inside the ``.data``
    section is handed to ``self.read_string``. An address is skipped when
    ``read_string`` returns ``None`` or when the returned bytes cannot be
    decoded using ``encoding``.
    """
    data_section = self.get_section_by_name(".data")
    for candidate in self._relocated_addrs:
        # Only addresses that fall inside .data are of interest.
        if not data_section.contains_vaddr(candidate):
            continue

        payload = self.read_string(candidate)
        if payload is None:
            continue

        try:
            text = payload.decode(encoding)
        except UnicodeDecodeError:
            # The bytes are not valid text in the requested encoding.
            continue

        yield (candidate, text)
def get_section_by_name(self, name: str) -> Section:
section = next(
filter(lambda section: section.match_name(name), self.sections),

View File

@ -82,8 +82,9 @@ def __init__(
self._load_cvdump()
self._load_markers()
self._find_original_strings()
# Detect floats first to eliminate potential overlap with string data
self._find_float_const()
self._find_original_strings()
self._match_imports()
self._match_exports()
self._match_thunks()
@ -314,7 +315,7 @@ def _find_original_strings(self):
"""Go to the original binary and look for the specified string constants
to find a match. This is a (relatively) expensive operation so we only
look at strings that we have not already matched via a STRING annotation."""
# Release builds give each de-duped string a symbol so they are easy to find and match.
for string in self._db.get_unmatched_strings():
addr = self.orig_bin.find_string(string.encode("latin1"))
if addr is None:
@ -324,6 +325,24 @@ def _find_original_strings(self):
self._db.match_string(addr, string)
# Debug builds do not de-dupe the strings, so we need to find them via brute force scan.
# We could try to match the string addrs if there is only one in orig and recomp.
# When we sanitize the asm, the result is the same regardless.
if self.orig_bin.is_debug:
for addr, string in self.orig_bin.iter_string("latin1"):
# Arbitrary cutoff: only keep strings longer than 4 characters (5 or more). I think this is what Ghidra does too
if len(string) > 4 and string[0].isalnum():
self._db.set_orig_symbol(
addr, SymbolType.STRING, string, len(string)
)
if self.recomp_bin.is_debug:
for addr, string in self.recomp_bin.iter_string("latin1"):
if len(string) > 4 and string[0].isalnum():
self._db.set_recomp_symbol(
addr, SymbolType.STRING, string, None, len(string)
)
def _find_float_const(self):
"""Add floating point constants in each binary to the database.
We are not matching anything right now because these values are not