From e97e20bb6adcab619469f1b5ababbcb5b3b3ed16 Mon Sep 17 00:00:00 2001 From: tntexplosivesltd Date: Wed, 22 Nov 2023 23:11:45 +1300 Subject: [PATCH] Fix linting, move classes to utils --- .pylintrc | 4 +- tools/isledecomp/isledecomp/__init__.py | 3 + tools/isledecomp/isledecomp/dir.py | 42 + tools/isledecomp/isledecomp/parser/parser.py | 1 - tools/isledecomp/isledecomp/utils.py | 69 ++ tools/reccmp/reccmp.py | 764 +++++++++---------- 6 files changed, 467 insertions(+), 416 deletions(-) diff --git a/.pylintrc b/.pylintrc index 267cbb89..91e2143d 100644 --- a/.pylintrc +++ b/.pylintrc @@ -150,7 +150,7 @@ class-attribute-naming-style=any #class-attribute-rgx= # Naming style matching correct class constant names. -class-const-naming-style=snake_case +class-const-naming-style=UPPER_CASE # Regular expression matching correct class constant names. Overrides class- # const-naming-style. If left empty, class constant names will be checked with @@ -309,7 +309,7 @@ max-public-methods=20 max-returns=6 # Maximum number of statements in function / method body. -max-statements=50 +max-statements=75 # Minimum number of public methods for a class (see R0903). min-public-methods=0 diff --git a/tools/isledecomp/isledecomp/__init__.py b/tools/isledecomp/isledecomp/__init__.py index e69de29b..77de2b08 100644 --- a/tools/isledecomp/isledecomp/__init__.py +++ b/tools/isledecomp/isledecomp/__init__.py @@ -0,0 +1,3 @@ +from .dir import * +from .utils import * +from .parser import * diff --git a/tools/isledecomp/isledecomp/dir.py b/tools/isledecomp/isledecomp/dir.py index 505e1f9f..3ee95a87 100644 --- a/tools/isledecomp/isledecomp/dir.py +++ b/tools/isledecomp/isledecomp/dir.py @@ -1,7 +1,45 @@ import os +import subprocess +import sys from typing import Iterator +class WinePathConverter: + def __init__(self, unix_cwd): + self.unix_cwd = unix_cwd + self.win_cwd = self._call_winepath_unix2win(self.unix_cwd) + + def get_wine_path(self, unix_fn: str) -> str: + if unix_fn.startswith("./"): + return self.win_cwd + "\\" + unix_fn[2:].replace("/", "\\") + if unix_fn.startswith(self.unix_cwd): + return ( + self.win_cwd + + "\\" + + unix_fn.removeprefix(self.unix_cwd).replace("/", "\\").lstrip("\\") + ) + return self._call_winepath_unix2win(unix_fn) + + def get_unix_path(self, win_fn: str) -> str: + if win_fn.startswith(".\\") or win_fn.startswith("./"): + return self.unix_cwd + "/" + win_fn[2:].replace("\\", "/") + if win_fn.startswith(self.win_cwd): + return ( + self.unix_cwd + + "/" + + win_fn.removeprefix(self.win_cwd).replace("\\", "/") + ) + return self._call_winepath_win2unix(win_fn) + + @staticmethod + def _call_winepath_unix2win(fn: str) -> str: + return subprocess.check_output(["winepath", "-w", fn], text=True).strip() + + @staticmethod + def _call_winepath_win2unix(fn: str) -> str: + return subprocess.check_output(["winepath", fn], text=True).strip() + + def is_file_cpp(filename: str) -> bool: (_, ext) = os.path.splitext(filename) return ext.lower() in (".h", ".cpp") @@ -19,3 +57,7 @@ def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]: if not recursive: break + + +def get_file_in_script_dir(fn): + return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn) diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py index 0d02b27c..1039ac41 100644 --- a/tools/isledecomp/isledecomp/parser/parser.py +++ b/tools/isledecomp/isledecomp/parser/parser.py @@ -7,7 +7,6 @@ OffsetMatch, is_blank_or_comment, match_offset_comment, - is_exact_offset_comment, get_template_function_name, remove_trailing_comment, distinct_by_module, diff --git a/tools/isledecomp/isledecomp/utils.py b/tools/isledecomp/isledecomp/utils.py index f51ca06f..c7534cba 100644 --- a/tools/isledecomp/isledecomp/utils.py +++ b/tools/isledecomp/isledecomp/utils.py @@ -1,3 +1,5 @@ +import struct + import colorama @@ -20,3 +22,70 @@ def print_diff(udiff, plain): if not plain: print(colorama.Style.RESET_ALL, end="") return has_diff + + +# Declare a class that can automatically convert virtual executable addresses +# to file addresses +class Bin: + def __init__(self, filename, logger): + self.logger = logger + self.logger.debug('Parsing headers of "%s"... ', filename) + self.filename = filename + self.file = None + self.imagebase = None + self.textvirt = None + self.textraw = None + + def __enter__(self): + self.logger.debug(f"Bin {self.filename} Enter") + self.file = open(self.filename, "rb") + + # HACK: Strictly, we should be parsing the header, but we know where + # everything is in these two files so we just jump straight there + + # Read ImageBase + self.file.seek(0xB4) + (self.imagebase,) = struct.unpack("" + self.replacements[replace_addr] = replacement + return replacement diff --git a/tools/reccmp/reccmp.py b/tools/reccmp/reccmp.py index 2a642b0c..509a3e7f 100755 --- a/tools/reccmp/reccmp.py +++ b/tools/reccmp/reccmp.py @@ -7,186 +7,44 @@ import logging import os import re -import struct import subprocess import sys -from isledecomp.dir import walk_source_dir -from isledecomp.parser import find_code_blocks -from isledecomp.utils import print_diff +from isledecomp import ( + Bin, + find_code_blocks, + OffsetPlaceholderGenerator, + print_diff, + RecompiledInfo, + walk_source_dir, + WinePathConverter, +) from capstone import Cs, CS_ARCH_X86, CS_MODE_32 import colorama from pystache import Renderer -parser = argparse.ArgumentParser( - allow_abbrev=False, - description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.", +REGISTER_LIST = set( + [ + "ax", + "bp", + "bx", + "cx", + "di", + "dx", + "eax", + "ebp", + "ebx", + "ecx", + "edi", + "edx", + "esi", + "esp", + "si", + "sp", + ] ) -parser.add_argument("original", metavar="original-binary", help="The original binary") -parser.add_argument( - "recompiled", metavar="recompiled-binary", help="The recompiled binary" -) -parser.add_argument( - "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary" -) -parser.add_argument( - "decomp_dir", metavar="decomp-dir", help="The decompiled source tree" -) -parser.add_argument( - "--total", - "-T", - metavar="", - help="Total number of expected functions (improves total accuracy statistic)", -) -parser.add_argument( - "--verbose", - "-v", - metavar="", - help="Print assembly diff for specific function (original file's offset)", -) -parser.add_argument( - "--html", - "-H", - metavar="", - help="Generate searchable HTML summary of status and diffs", -) -parser.add_argument( - "--no-color", "-n", action="store_true", help="Do not color the output" -) -parser.add_argument( - "--svg", "-S", metavar="", help="Generate SVG graphic of progress" -) -parser.add_argument("--svg-icon", metavar="icon", help="Icon to use in SVG (PNG)") -parser.add_argument( - "--print-rec-addr", - action="store_true", - help="Print addresses of recompiled functions too", -) - -parser.set_defaults(loglevel=logging.INFO) -parser.add_argument( - "--debug", - action="store_const", - const=logging.DEBUG, - dest="loglevel", - help="Print script debug information", -) - -args = parser.parse_args() - -logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s") -logger = logging.getLogger(__name__) - -colorama.init() - -verbose = None -found_verbose_target = False -if args.verbose: - try: - verbose = int(args.verbose, 16) - except ValueError: - parser.error("invalid verbose argument") -html_path = args.html - -plain = args.no_color - -original = args.original -if not os.path.isfile(original): - parser.error(f"Original binary {original} does not exist") - -recomp = args.recompiled -if not os.path.isfile(recomp): - parser.error(f"Recompiled binary {recomp} does not exist") - -syms = args.pdb -if not os.path.isfile(syms): - parser.error(f"Symbols PDB {syms} does not exist") - -source = args.decomp_dir -if not os.path.isdir(source): - parser.error(f"Source directory {source} does not exist") - -svg = args.svg - - -# Declare a class that can automatically convert virtual executable addresses -# to file addresses -class Bin: - def __init__(self, filename): - logger.debug(f'Parsing headers of "{filename}"... ') - self.file = open(filename, "rb") - - # HACK: Strictly, we should be parsing the header, but we know where - # everything is in these two files so we just jump straight there - - # Read ImageBase - self.file.seek(0xB4) - (self.imagebase,) = struct.unpack(" str: - if unix_fn.startswith("./"): - return self.win_cwd + "\\" + unix_fn[2:].replace("/", "\\") - if unix_fn.startswith(self.unix_cwd): - return ( - self.win_cwd - + "\\" - + unix_fn.removeprefix(self.unix_cwd).replace("/", "\\").lstrip("\\") - ) - return self._call_winepath_unix2win(unix_fn) - - def get_unix_path(self, win_fn: str) -> str: - if win_fn.startswith(".\\") or win_fn.startswith("./"): - return self.unix_cwd + "/" + win_fn[2:].replace("\\", "/") - if win_fn.startswith(self.win_cwd): - return ( - self.unix_cwd - + "/" - + win_fn.removeprefix(self.win_cwd).replace("\\", "/") - ) - return self._call_winepath_win2unix(win_fn) - - @staticmethod - def _call_winepath_unix2win(fn: str) -> str: - return subprocess.check_output(["winepath", "-w", fn], text=True).strip() - - @staticmethod - def _call_winepath_win2unix(fn: str) -> str: - return subprocess.check_output(["winepath", fn], text=True).strip() +WORDS = re.compile(r"\w+") def get_file_in_script_dir(fn): @@ -199,33 +57,36 @@ class SymInfo: lines = {} names = {} - def __init__(self, pdb, file, wine_path_converter): + def __init__(self, pdb, sym_recompfile, sym_logger, sym_wine_path_converter=None): + self.logger = sym_logger call = [get_file_in_script_dir("cvdump.exe"), "-l", "-s"] - if wine_path_converter: + if sym_wine_path_converter: # Run cvdump through wine and convert path to Windows-friendly wine path call.insert(0, "wine") - call.append(wine_path_converter.get_wine_path(pdb)) + call.append(sym_wine_path_converter.get_wine_path(pdb)) else: call.append(pdb) - logger.info(f"Parsing {pdb} ...") - logger.debug(f"Command = {call}") + self.logger.info("Parsing %s ...", pdb) + self.logger.debug("Command = %s", call) line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n") current_section = None - logger.debug("Parsing output of cvdump.exe ...") + self.logger.debug("Parsing output of cvdump.exe ...") for i, line in enumerate(line_dump): if line.startswith("***"): current_section = line[4:] if current_section == "SYMBOLS" and "S_GPROC32" in line: - addr = int(line[26:34], 16) + sym_addr = int(line[26:34], 16) info = RecompiledInfo() - info.addr = addr + recompfile.imagebase + recompfile.textvirt + info.addr = ( + sym_addr + sym_recompfile.imagebase + sym_recompfile.textvirt + ) use_dbg_offs = False if use_dbg_offs: @@ -242,7 +103,7 @@ def __init__(self, pdb, file, wine_path_converter): info.name = line[77:] self.names[info.name] = info - self.funcs[addr] = info + self.funcs[sym_addr] = info elif ( current_section == "LINES" and line.startswith(" ") @@ -250,9 +111,9 @@ def __init__(self, pdb, file, wine_path_converter): ): sourcepath = line.split()[0] - if wine_path_converter: + if sym_wine_path_converter: # Convert filename to Unix path for file compare - sourcepath = wine_path_converter.get_unix_path(sourcepath) + sourcepath = sym_wine_path_converter.get_unix_path(sourcepath) if sourcepath not in self.lines: self.lines[sourcepath] = {} @@ -273,13 +134,12 @@ def __init__(self, pdb, file, wine_path_converter): j += 1 - logger.debug("... Parsing output of cvdump.exe finished") + self.logger.debug("... Parsing output of cvdump.exe finished") def get_recompiled_address(self, filename, line): - addr = None - found = False + recompiled_addr = None - logger.debug(f"Looking for {filename}:{line}") + self.logger.debug("Looking for %s:%s", filename, line) filename_basename = os.path.basename(filename).lower() for fn in self.lines: @@ -291,55 +151,32 @@ def get_recompiled_address(self, filename, line): ).lower() == filename_basename and os.path.samefile(fn, filename): filename = fn break - except FileNotFoundError as e: + except FileNotFoundError: continue - if filename in self.lines and line in self.lines[fn]: - addr = self.lines[fn][line] + if filename in self.lines and line in self.lines[filename]: + recompiled_addr = self.lines[filename][line] - if addr in self.funcs: - return self.funcs[addr] - else: - logger.error(f"Failed to find function symbol with address: 0x{addr:x}") - else: - logger.error( - f"Failed to find function symbol with filename and line: {filename}:{line}" + if recompiled_addr in self.funcs: + return self.funcs[recompiled_addr] + self.logger.error( + "Failed to find function symbol with address: %x", recompiled_addr ) + return None + self.logger.error( + "Failed to find function symbol with filename and line: %s:%s", + filename, + line, + ) + return None def get_recompiled_address_from_name(self, name): - logger.debug("Looking for %s", name) + self.logger.debug("Looking for %s", name) if name in self.names: return self.names[name] - else: - logger.error(f"Failed to find function symbol with name: {name}") - - -wine_path_converter = None -if os.name != "nt": - wine_path_converter = WinePathConverter(source) -origfile = Bin(original) -recompfile = Bin(recomp) -syminfo = SymInfo(syms, recompfile, wine_path_converter) - -print() - -md = Cs(CS_ARCH_X86, CS_MODE_32) - - -class OffsetPlaceholderGenerator: - def __init__(self): - self.counter = 0 - self.replacements = {} - - def get(self, addr): - if addr in self.replacements: - return self.replacements[addr] - else: - self.counter += 1 - replacement = f"" - self.replacements[addr] = replacement - return replacement + self.logger.error("Failed to find function symbol with name: %s", name) + return None def sanitize(file, placeholder_generator, mnemonic, op_str): @@ -350,7 +187,7 @@ def sanitize(file, placeholder_generator, mnemonic, op_str): except ValueError: pass - if (mnemonic == "call" or mnemonic == "jmp") and op_str_is_number: + if (mnemonic in ["call", "jmp"]) and op_str_is_number: # Filter out "calls" because the offsets we're not currently trying to # match offsets. As long as there's a call in the right place, it's # probably accurate. @@ -391,11 +228,11 @@ def filter_out_ptr(ptype, op_str): return mnemonic, op_str -def parse_asm(file, addr, size): +def parse_asm(disassembler, file, asm_addr, size): asm = [] - data = file.read(addr, size) + data = file.read(asm_addr, size) placeholder_generator = OffsetPlaceholderGenerator() - for i in md.disasm(data, 0): + for i in disassembler.disasm(data, 0): # Use heuristics to disregard some differences that aren't representative # of the accuracy of a function (e.g. global offsets) mnemonic, op_str = sanitize(file, placeholder_generator, i.mnemonic, i.op_str) @@ -406,29 +243,6 @@ def parse_asm(file, addr, size): return asm -REGISTER_LIST = set( - [ - "ax", - "bp", - "bx", - "cx", - "di", - "dx", - "eax", - "ebp", - "ebx", - "ecx", - "edi", - "edx", - "esi", - "esp", - "si", - "sp", - ] -) -WORDS = re.compile(r"\w+") - - def get_registers(line: str): to_replace = [] # use words regex to find all matching positions: @@ -460,15 +274,14 @@ def can_resolve_register_differences(original_asm, new_asm): return False # Look for the mismatching lines - for i in range(len(original_asm)): + for i, original_line in enumerate(original_asm): new_line = new_asm[i] - original_line = original_asm[i] if new_line != original_line: # Find all the registers to replace to_replace = get_registers(original_line) - for j in range(len(to_replace)): - (reg, reg_index) = to_replace[j] + for replace in to_replace: + (reg, reg_index) = replace replacing_reg = new_line[reg_index : reg_index + len(reg)] if replacing_reg in REGISTER_LIST: if replacing_reg != reg: @@ -481,147 +294,18 @@ def can_resolve_register_differences(original_asm, new_asm): # No replacement to do, different code, bail out return False # Check if the lines are now the same - for i in range(len(original_asm)): - if new_asm[i] != original_asm[i]: + for i, original_line in enumerate(original_asm): + if new_asm[i] != original_line: return False return True -function_count = 0 -total_accuracy = 0 -total_effective_accuracy = 0 -htmlinsert = [] - -# Generate basename of original file, used in locating OFFSET lines -basename = os.path.basename(os.path.splitext(original)[0]) - -for srcfilename in walk_source_dir(source): - with open(srcfilename, "r") as srcfile: - blocks = find_code_blocks(srcfile) - - for block in blocks: - if block.is_stub: - continue - - if block.module != basename: - continue - - addr = block.offset - # Verbose flag handling - if verbose: - if addr == verbose: - found_verbose_target = True - else: - continue - - if block.is_template: - recinfo = syminfo.get_recompiled_address_from_name(block.signature) - if not recinfo: - continue - else: - recinfo = syminfo.get_recompiled_address(srcfilename, block.start_line) - if not recinfo: - continue - - # The effective_ratio is the ratio when ignoring differing register - # allocation vs the ratio is the true ratio. - ratio = 0.0 - effective_ratio = 0.0 - if recinfo.size: - origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size) - recompasm = parse_asm( - recompfile, recinfo.addr + recinfo.start, recinfo.size - ) - - diff = difflib.SequenceMatcher(None, origasm, recompasm) - ratio = diff.ratio() - effective_ratio = ratio - - if ratio != 1.0: - # Check whether we can resolve register swaps which are actually - # perfect matches modulo compiler entropy. - if can_resolve_register_differences(origasm, recompasm): - effective_ratio = 1.0 - else: - ratio = 0 - - percenttext = f"{(effective_ratio * 100):.2f}%" - if not plain: - if effective_ratio == 1.0: - percenttext = ( - colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL - ) - elif effective_ratio > 0.8: - percenttext = ( - colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL - ) - else: - percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL - - if effective_ratio == 1.0 and ratio != 1.0: - if plain: - percenttext += "*" - else: - percenttext += colorama.Fore.RED + "*" + colorama.Style.RESET_ALL - - if args.print_rec_addr: - addrs = f"0x{addr:x} / 0x{recinfo.addr:x}" - else: - addrs = hex(addr) - - if not verbose: - print( - f" {recinfo.name} ({addrs}) is {percenttext} similar to the original" - ) - - function_count += 1 - total_accuracy += ratio - total_effective_accuracy += effective_ratio - - if recinfo.size: - udiff = difflib.unified_diff(origasm, recompasm, n=10) - - # If verbose, print the diff for that function to the output - if verbose: - if effective_ratio == 1.0: - ok_text = ( - "OK!" - if plain - else ( - colorama.Fore.GREEN + "✨ OK! ✨" + colorama.Style.RESET_ALL - ) - ) - if ratio == 1.0: - print(f"{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n") - else: - print( - f"{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n" - ) - else: - print_diff(udiff, plain) - - print( - f"\n{recinfo.name} is only {percenttext} similar to the original, diff above" - ) - - # If html, record the diffs to an HTML file - if html_path: - htmlinsert.append( - { - "address": f"0x{addr:x}", - "name": recinfo.name, - "matching": effective_ratio, - "diff": "\n".join(udiff), - } - ) - - def gen_html(html_file, data): output_data = Renderer().render_path( get_file_in_script_dir("template.html"), {"data": data} ) - with open(html_file, "w") as htmlfile: + with open(html_file, "w", encoding="utf-8") as htmlfile: htmlfile.write(output_data) @@ -644,35 +328,289 @@ def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_ac "percent": f"{(total_statistic * 100):.2f}%", }, ) - with open(svg_file, "w") as svgfile: + with open(svg_file, "w", encoding="utf-8") as svgfile: svgfile.write(output_data) -if html_path: - gen_html(html_path, json.dumps(htmlinsert)) +# Do the actual work +if __name__ == "__main__": + parser = argparse.ArgumentParser( + allow_abbrev=False, + description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.", + ) + parser.add_argument( + "original", metavar="original-binary", help="The original binary" + ) + parser.add_argument( + "recompiled", metavar="recompiled-binary", help="The recompiled binary" + ) + parser.add_argument( + "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary" + ) + parser.add_argument( + "decomp_dir", metavar="decomp-dir", help="The decompiled source tree" + ) + parser.add_argument( + "--total", + "-T", + metavar="", + help="Total number of expected functions (improves total accuracy statistic)", + ) + parser.add_argument( + "--verbose", + "-v", + metavar="", + help="Print assembly diff for specific function (original file's offset)", + ) + parser.add_argument( + "--html", + "-H", + metavar="", + help="Generate searchable HTML summary of status and diffs", + ) + parser.add_argument( + "--no-color", "-n", action="store_true", help="Do not color the output" + ) + parser.add_argument( + "--svg", "-S", metavar="", help="Generate SVG graphic of progress" + ) + parser.add_argument("--svg-icon", metavar="icon", help="Icon to use in SVG (PNG)") + parser.add_argument( + "--print-rec-addr", + action="store_true", + help="Print addresses of recompiled functions too", + ) -if verbose: - if not found_verbose_target: - print(f"Failed to find the function with address 0x{verbose:x}") -else: - implemented_funcs = function_count + parser.set_defaults(loglevel=logging.INFO) + parser.add_argument( + "--debug", + action="store_const", + const=logging.DEBUG, + dest="loglevel", + help="Print script debug information", + ) - if args.total: - function_count = int(args.total) + args = parser.parse_args() - if function_count > 0: - effective_accuracy = total_effective_accuracy / function_count * 100 - actual_accuracy = total_accuracy / function_count * 100 - print( - f"\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)" + logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s") + logger = logging.getLogger(__name__) + + colorama.init() + + verbose = None + found_verbose_target = False + if args.verbose: + try: + verbose = int(args.verbose, 16) + except ValueError: + parser.error("invalid verbose argument") + html_path = args.html + + plain = args.no_color + + original = args.original + if not os.path.isfile(original): + parser.error(f"Original binary {original} does not exist") + + recomp = args.recompiled + if not os.path.isfile(recomp): + parser.error(f"Recompiled binary {recomp} does not exist") + + syms = args.pdb + if not os.path.isfile(syms): + parser.error(f"Symbols PDB {syms} does not exist") + + source = args.decomp_dir + if not os.path.isdir(source): + parser.error(f"Source directory {source} does not exist") + + svg = args.svg + + wine_path_converter = None + if os.name != "nt": + wine_path_converter = WinePathConverter(source) + with Bin(original, logger) as origfile, Bin(recomp, logger) as recompfile: + syminfo = SymInfo( + syms, recompfile, logger, sym_wine_path_converter=wine_path_converter ) - if svg: - gen_svg( - svg, - os.path.basename(original), - args.svg_icon, - implemented_funcs, - function_count, - total_effective_accuracy, - ) + print() + + capstone_disassembler = Cs(CS_ARCH_X86, CS_MODE_32) + + function_count = 0 + total_accuracy = 0 + total_effective_accuracy = 0 + htmlinsert = [] + + # Generate basename of original file, used in locating OFFSET lines + basename = os.path.basename(os.path.splitext(original)[0]) + + for srcfilename in walk_source_dir(source): + with open(srcfilename, "r", encoding="utf-8") as srcfile: + blocks = find_code_blocks(srcfile) + + for block in blocks: + if block.is_stub: + continue + + if block.module != basename: + continue + + addr = block.offset + # Verbose flag handling + if verbose: + if addr == verbose: + found_verbose_target = True + else: + continue + + if block.is_template: + recinfo = syminfo.get_recompiled_address_from_name(block.signature) + if not recinfo: + continue + else: + recinfo = syminfo.get_recompiled_address( + srcfilename, block.start_line + ) + if not recinfo: + continue + + # The effective_ratio is the ratio when ignoring differing register + # allocation vs the ratio is the true ratio. + ratio = 0.0 + effective_ratio = 0.0 + if recinfo.size: + origasm = parse_asm( + capstone_disassembler, + origfile, + addr + recinfo.start, + recinfo.size, + ) + recompasm = parse_asm( + capstone_disassembler, + recompfile, + recinfo.addr + recinfo.start, + recinfo.size, + ) + + diff = difflib.SequenceMatcher(None, origasm, recompasm) + ratio = diff.ratio() + effective_ratio = ratio + + if ratio != 1.0: + # Check whether we can resolve register swaps which are actually + # perfect matches modulo compiler entropy. + if can_resolve_register_differences(origasm, recompasm): + effective_ratio = 1.0 + else: + ratio = 0 + + percenttext = f"{(effective_ratio * 100):.2f}%" + if not plain: + if effective_ratio == 1.0: + percenttext = ( + colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL + ) + elif effective_ratio > 0.8: + percenttext = ( + colorama.Fore.YELLOW + + percenttext + + colorama.Style.RESET_ALL + ) + else: + percenttext = ( + colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL + ) + + if effective_ratio == 1.0 and ratio != 1.0: + if plain: + percenttext += "*" + else: + percenttext += ( + colorama.Fore.RED + "*" + colorama.Style.RESET_ALL + ) + + if args.print_rec_addr: + addrs = f"0x{addr:x} / 0x{recinfo.addr:x}" + else: + addrs = hex(addr) + + if not verbose: + print( + f" {recinfo.name} ({addrs}) is {percenttext} similar to the original" + ) + + function_count += 1 + total_accuracy += ratio + total_effective_accuracy += effective_ratio + + if recinfo.size: + udiff = difflib.unified_diff(origasm, recompasm, n=10) + + # If verbose, print the diff for that function to the output + if verbose: + if effective_ratio == 1.0: + ok_text = ( + "OK!" + if plain + else ( + colorama.Fore.GREEN + + "✨ OK! ✨" + + colorama.Style.RESET_ALL + ) + ) + if ratio == 1.0: + print( + f"{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n" + ) + else: + print( + f"{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n" + ) + else: + print_diff(udiff, plain) + + print( + f"\n{recinfo.name} is only {percenttext} similar to the original, diff above" + ) + + # If html, record the diffs to an HTML file + if html_path: + htmlinsert.append( + { + "address": f"0x{addr:x}", + "name": recinfo.name, + "matching": effective_ratio, + "diff": "\n".join(udiff), + } + ) + + if html_path: + gen_html(html_path, json.dumps(htmlinsert)) + + if verbose: + if not found_verbose_target: + print(f"Failed to find the function with address 0x{verbose:x}") + else: + implemented_funcs = function_count + + if args.total: + function_count = int(args.total) + + if function_count > 0: + effective_accuracy = total_effective_accuracy / function_count * 100 + actual_accuracy = total_accuracy / function_count * 100 + print( + f"\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)" + ) + + if svg: + gen_svg( + svg, + os.path.basename(original), + args.svg_icon, + implemented_funcs, + function_count, + total_effective_accuracy, + )