diff --git a/tools/ghidra_scripts/README.md b/tools/ghidra_scripts/README.md index 7bd5133e..ebf32da8 100644 --- a/tools/ghidra_scripts/README.md +++ b/tools/ghidra_scripts/README.md @@ -17,4 +17,6 @@ Since these scripts and its dependencies are written in Python 3, [Ghidrathon](h ## Development - Type hints for Ghira (optional): Download a recent release from https://github.com/VDOO-Connected-Trust/ghidra-pyi-generator, unpack it somewhere, and `pip install` that directory in this virtual environment. This provides types and headers for Python. -- Note that as of 2024-05-20 there is a [bug](https://github.com/mandiant/Ghidrathon/issues/103) in Ghidrathon v4.0.0: Changes in dependent scripts are not detected. If you modify a file that is imported by the script, you must restart Ghidra for the change to have any effect. + Be aware that some of these files contain errors - in particular, `from typing import overload` seems to be missing everywhere, leading to spurious type errors. +- Note that the imported modules persist across multiple runs of the script (see [here](https://github.com/mandiant/Ghidrathon/issues/103)). + If you intend to modify an imported library, you have to use `import importlib; importlib.reload(${library})` or restart Ghidra for your changes to have any effect. diff --git a/tools/ghidra_scripts/import_functions_from_pdb.py b/tools/ghidra_scripts/import_functions_from_pdb.py index 6395e4b6..0a970eae 100644 --- a/tools/ghidra_scripts/import_functions_from_pdb.py +++ b/tools/ghidra_scripts/import_functions_from_pdb.py @@ -9,6 +9,11 @@ # @menupath # @toolbar + +# pylint: disable=wrong-import-position,ungrouped-imports +# pylint: disable=undefined-variable # need to disable this one globally because pylint does not understand e.g. 
askYesNo() + +import importlib from dataclasses import dataclass, field import sys import logging @@ -16,14 +21,25 @@ import traceback from typing import TYPE_CHECKING -from lego_util.exceptions import Lego1Exception -from lego_util.statistics import Statistics -# pylint: disable=undefined-variable # need to disable this one globally because pylint does not understand e.g. askYesNo() if TYPE_CHECKING: import ghidra from lego_util.headers import * # pylint: disable=wildcard-import + +def reload_module(module: str): + """ + Due to a quirk in Jep (used by Ghidrathon), imported modules persist for the lifetime of the Ghidra process + and are not reloaded when relaunching the script. Therefore, in order to facilitate development + we force reload all our own modules at startup. + """ + importlib.reload(importlib.import_module(module)) + + +reload_module("lego_util.statistics") +from lego_util.statistics import Statistics + + logger = logging.getLogger(__name__) @@ -58,18 +74,7 @@ class Globals: # In the latter case, only the C++ parser can be used. setup_logging() try: - - # this one contains actual code - from lego_util.ghidra_helper import ( - get_ghidra_namespace, - get_ghidra_type, - ) - - from ghidra.program.model.listing import Function, Parameter from ghidra.program.flatapi import FlatProgramAPI - from ghidra.program.model.listing import ParameterImpl - from ghidra.program.model.listing import Function - from ghidra.program.model.symbol import SourceType from ghidra.util.exception import CancelledException GLOBALS.make_changes = askYesNo( @@ -82,11 +87,13 @@ class Globals: ) GLOBALS.running_from_ghidra = True -except ImportError: +except ImportError as importError: logger.error( "Failed to import Ghidra functions, doing a dry run for the source code parser. " "Has this script been launched from Ghidra?" 
) + logger.debug("Precise import error:", exc_info=importError) + GLOBALS.running_from_ghidra = False CancelledException = None @@ -102,222 +109,18 @@ def add_python_path(path: str): sys.path.insert(1, str(venv_path)) -class PdbFunctionWithGhidraObjects: - """A representation of a function from the PDB with each type replaced by a Ghidra type instance.""" - def __init__( - self, - fpapi: "FlatProgramAPI", - match_info: "MatchInfo", - signature: "FunctionSignature", - ): - self.api = fpapi - self.match_info = match_info - self.signature = signature - - assert match_info.name is not None - colon_split = match_info.name.split("::") - self.name = colon_split.pop() - namespace_hierachy = colon_split - self.namespace = get_ghidra_namespace(fpapi, namespace_hierachy) - - self.return_type = get_ghidra_type(fpapi, signature.return_type) - self.arguments = [ - ParameterImpl( - f"param{index}", - get_ghidra_type(fpapi, type_name), - fpapi.getCurrentProgram(), - ) - for (index, type_name) in enumerate(signature.arglist) - ] - - @property - def call_type(self): - return self.signature.call_type - - @property - def stack_symbols(self): - return self.signature.stack_symbols - - def get_full_name(self) -> str: - return f"{self.namespace.getName()}::{self.name}" - - def format_proposed_change(self) -> str: - return ( - f"{self.return_type} {self.call_type} {self.get_full_name()}" - + f"({', '.join(self.signature.arglist)})" - ) - - def matches_ghidra_function(self, ghidra_function): # type: (Function) -> bool - """Checks whether this function declaration already matches the description in Ghidra""" - name_match = self.name == ghidra_function.getName(False) - namespace_match = self.namespace == ghidra_function.getParentNamespace() - return_type_match = self.return_type == ghidra_function.getReturnType() - # match arguments: decide if thiscall or not - thiscall_matches = ( - self.signature.call_type == ghidra_function.getCallingConventionName() - ) - - if thiscall_matches: - if 
self.signature.call_type == "__thiscall": - args_match = self._matches_thiscall_parameters(ghidra_function) - else: - args_match = self._matches_non_thiscall_parameters(ghidra_function) - else: - args_match = False - - logger.debug( - "Matches: namespace=%s name=%s return_type=%s thiscall=%s args=%s", - namespace_match, - name_match, - return_type_match, - thiscall_matches, - args_match, - ) - - return ( - name_match - and namespace_match - and return_type_match - and thiscall_matches - and args_match - ) - - def _matches_non_thiscall_parameters( - self, ghidra_function - ): # type: (Function) -> bool - return self._parameter_lists_match(ghidra_function.getParameters()) - - def _matches_thiscall_parameters(self, ghidra_function: "Function") -> bool: - ghidra_params = list(ghidra_function.getParameters()) - - # remove the `this` argument which we don't generate ourselves - ghidra_params.pop(0) - - return self._parameter_lists_match(ghidra_params) - - def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool: - if len(self.arguments) != len(ghidra_params): - logger.info("Mismatching argument count") - return False - - for this_arg, ghidra_arg in zip(self.arguments, ghidra_params): - # compare argument types - if this_arg.getDataType() != ghidra_arg.getDataType(): - logger.debug( - "Mismatching arg type: expected %s, found %s", - this_arg.getDataType(), - ghidra_arg.getDataType(), - ) - return False - # compare argument names - stack_match = self.get_matching_stack_symbol(ghidra_arg.getStackOffset()) - if stack_match is None: - logger.debug("Not found on stack: %s", ghidra_arg) - return False - # "__formal" is the placeholder for arguments without a name - if stack_match.name not in ["__formal", ghidra_arg.getName()]: - logger.debug( - "Argument name mismatch: expected %s, found %s", - stack_match.name, - ghidra_arg.getName(), - ) - return False - return True - - def overwrite_ghidra_function(self, ghidra_function): # type: (Function) -> None - 
"""Replace the function declaration in Ghidra by the one derived from C++.""" - ghidra_function.setName(self.name, SourceType.USER_DEFINED) - ghidra_function.setParentNamespace(self.namespace) - ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED) - ghidra_function.setCallingConvention(self.call_type) - - ghidra_function.replaceParameters( - Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS, - True, - SourceType.USER_DEFINED, - self.arguments, - ) - - # When we set the parameters, Ghidra will generate the layout. - # Now we read them again and match them against the stack layout in the PDB, - # both to verify and to set the parameter names. - ghidra_parameters: "list[ghidra.program.model.listing.Parameter]" = ghidra_function.getParameters() # type: ignore - - # Try to add Ghidra function names - for param in ghidra_parameters: - if param.isStackVariable(): - self._rename_stack_parameter(param) - else: - if param.getName() == "this": - # 'this' parameters are auto-generated and cannot be changed - continue - - # TODO: Does this ever happen? 
- logger.warning("Unhandled register variable in %s", self.get_full_name) - continue - - # Old code for reference: - # - # register = param.getRegister().getName().lower() - # match = self.get_matching_register_symbol(register) - # if match is None: - # logger.error( - # "Could not match register parameter %s to known symbols %s", - # param, - # self.stack_symbols, - # ) - # continue - - def _rename_stack_parameter(self, param: "Parameter"): - match = self.get_matching_stack_symbol(param.getStackOffset()) - if match is None: - raise StackOffsetMismatchError( - f"Could not find a matching symbol at offset {param.getStackOffset()} in {self.get_full_name()}" - ) - - if param.getDataType() != get_ghidra_type(self.api, match.data_type): - logger.error( - "Type mismatch for parameter: %s in Ghidra, %s in PDB", param, match - ) - return - - param.setName(match.name, SourceType.USER_DEFINED) - - def get_matching_stack_symbol(self, stack_offset: int) -> "CppStackSymbol | None": - return next( - ( - symbol - for symbol in self.stack_symbols - if isinstance(symbol, CppStackSymbol) - and symbol.stack_offset == stack_offset - ), - None, - ) - - def get_matching_register_symbol(self, register: str) -> "CppRegisterSymbol | None": - return next( - ( - symbol - for symbol in self.stack_symbols - if isinstance(symbol, CppRegisterSymbol) and symbol.register == register - ), - None, - ) - - -def handle_function_in_ghidra(match_info: "MatchInfo", signature: "FunctionSignature"): - - if not GLOBALS.running_from_ghidra: - return +def migrate_function_to_ghidra( + api: "FlatProgramAPI", + match_info: "MatchInfo", + signature: "FunctionSignature" +): hex_original_address = f"{match_info.orig_addr:x}" # Find the Ghidra function at that address - ghidra_address = getAddressFactory().getAddress(hex_original_address) # type: ignore + ghidra_address = getAddressFactory().getAddress(hex_original_address) - fpapi = FlatProgramAPI(currentProgram()) # type: ignore - - typed_pdb_function = 
PdbFunctionWithGhidraObjects(fpapi, match_info, signature) + typed_pdb_function = PdbFunctionWithGhidraObjects(api, match_info, signature) if not GLOBALS.make_changes: return @@ -354,7 +157,7 @@ def handle_function_in_ghidra(match_info: "MatchInfo", signature: "FunctionSigna logger.critical("User quit, terminating") raise SystemExit(1) - # logger.info("Modifying function %s at 0x%s", cpp_function.full_name(), address) + logger.debug("Modifying function %s at 0x%s", typed_pdb_function.get_full_name(), hex_original_address) typed_pdb_function.overwrite_ghidra_function(ghidra_function) @@ -362,16 +165,22 @@ def handle_function_in_ghidra(match_info: "MatchInfo", signature: "FunctionSigna if GLOBALS.prompt_before_changes: # Add a prompt so we can verify the result immediately - askChoice("", "Click 'OK' to continue", ["OK"], "OK") + askChoice("Continue", "Click 'OK' to continue", ["OK"], "OK") -def handle_function_list(isle_compare: "IsleCompare"): +def process_functions(isle_compare: "IsleCompare"): # try to acquire matched functions migration = PdbExtractionForGhidraMigration(isle_compare) func_signatures = migration.get_function_list() + + if not GLOBALS.running_from_ghidra: + logger.info("Completed the dry run outside Ghidra.") + return + + fpapi = FlatProgramAPI(currentProgram()) for match_info, signature in func_signatures: try: - handle_function_in_ghidra(match_info, signature) + migrate_function_to_ghidra(fpapi, match_info, signature) GLOBALS.statistics.successes += 1 except Lego1Exception as e: log_and_track_failure(e) @@ -415,9 +224,10 @@ def main(): logger.info("Comparison complete.") try: - handle_function_list(isle_compare) + process_functions(isle_compare) finally: - GLOBALS.statistics.log() + if GLOBALS.running_from_ghidra: + GLOBALS.statistics.log() logger.info("Done") @@ -425,24 +235,31 @@ def main(): # sys.path is not reset after running the script, so we should restore it sys_path_backup = sys.path.copy() try: - add_python_path( - 
".venv/Lib/site-packages" - ) # make modules installed in the venv available in Ghidra - add_python_path( - "tools/isledecomp" - ) # needed when isledecomp is installed in editable mode in the venv + # make modules installed in the venv available in Ghidra + add_python_path(".venv/Lib/site-packages") + # This one is needed when isledecomp is installed in editable mode in the venv + add_python_path("tools/isledecomp") import setuptools # pylint: disable=unused-import # required to fix a distutils issue in Python 3.12 + reload_module("isledecomp") from isledecomp import Bin + reload_module("isledecomp.compare") from isledecomp.compare import Compare as IsleCompare + reload_module("isledecomp.compare.db") from isledecomp.compare.db import MatchInfo - from lego_util.pdb_extraction import ( # pylint: disable=ungrouped-imports # these must be imported + + reload_module("lego_util.exceptions") + from lego_util.exceptions import Lego1Exception + reload_module("lego_util.pdb_extraction") + from lego_util.pdb_extraction import ( PdbExtractionForGhidraMigration, FunctionSignature, - CppRegisterSymbol, - CppStackSymbol, ) - from lego_util.exceptions import StackOffsetMismatchError + + if GLOBALS.running_from_ghidra: + reload_module("lego_util.pdb_to_ghidra") + from lego_util.pdb_to_ghidra import PdbFunctionWithGhidraObjects + if __name__ == "__main__": main() diff --git a/tools/ghidra_scripts/lego_util/pdb_to_ghidra.py b/tools/ghidra_scripts/lego_util/pdb_to_ghidra.py new file mode 100644 index 00000000..ab0c0116 --- /dev/null +++ b/tools/ghidra_scripts/lego_util/pdb_to_ghidra.py @@ -0,0 +1,232 @@ +# This file can only be imported successfully when run from Ghidra using Ghidrathon. 
# Disable spurious warnings in vscode / pylance
# pyright: reportMissingModuleSource=false

import logging

from ghidra.program.model.listing import Function, Parameter
from ghidra.program.flatapi import FlatProgramAPI
from ghidra.program.model.listing import ParameterImpl
from ghidra.program.model.symbol import SourceType

from isledecomp.compare.db import MatchInfo

from lego_util.pdb_extraction import (
    FunctionSignature,
    CppRegisterSymbol,
    CppStackSymbol,
)
from lego_util.ghidra_helper import (
    get_ghidra_namespace,
    get_ghidra_type,
)
from lego_util.exceptions import StackOffsetMismatchError


logger = logging.getLogger(__name__)


class PdbFunctionWithGhidraObjects:
    """A representation of a function from the PDB with each type replaced by a Ghidra type instance.

    Attributes:
        api: The FlatProgramAPI used to look up namespaces and data types.
        match_info: The match entry the function was created from.
        signature: The function signature extracted from the PDB.
        name: The last `::`-separated component of `match_info.name`.
        namespace: Result of `get_ghidra_namespace` for the leading name components.
        return_type: Result of `get_ghidra_type` for the signature's return type.
        arguments: One `ParameterImpl` per entry of `signature.arglist`,
            with placeholder names `param0`, `param1`, ...
    """

    def __init__(
        self,
        fpapi: "FlatProgramAPI",
        match_info: "MatchInfo",
        signature: "FunctionSignature",
    ):
        """Resolve the namespace, return type and argument types of the function.

        Args:
            fpapi: API handle for the current Ghidra program.
            match_info: Match entry; its `name` must not be None.
            signature: Signature of the function as extracted from the PDB.
        """
        self.api = fpapi
        self.match_info = match_info
        self.signature = signature

        assert match_info.name is not None
        # NOTE(review): a plain split on "::" presumably assumes that the name
        # carries no "::" inside template arguments - confirm against the data.
        name_parts = match_info.name.split("::")
        self.name = name_parts.pop()
        # whatever precedes the function name is the namespace hierarchy
        self.namespace = get_ghidra_namespace(fpapi, name_parts)

        self.return_type = get_ghidra_type(fpapi, signature.return_type)
        # The final parameter names are only assigned in
        # `overwrite_ghidra_function`; until then placeholders are used.
        self.arguments = [
            ParameterImpl(
                f"param{index}",
                get_ghidra_type(fpapi, type_name),
                fpapi.getCurrentProgram(),
            )
            for (index, type_name) in enumerate(signature.arglist)
        ]

    @property
    def call_type(self):
        """The calling convention taken from the PDB signature."""
        return self.signature.call_type

    @property
    def stack_symbols(self):
        """The symbol list of the PDB signature (filtered by type in the lookups below)."""
        return self.signature.stack_symbols

    def get_full_name(self) -> str:
        """Return `<namespace name>::<function name>`, using `namespace.getName()`."""
        return f"{self.namespace.getName()}::{self.name}"

    def format_proposed_change(self) -> str:
        """Return a one-line, human-readable rendering of the function declaration."""
        return (
            f"{self.return_type} {self.call_type} {self.get_full_name()}"
            + f"({', '.join(self.signature.arglist)})"
        )

    def matches_ghidra_function(self, ghidra_function: "Function") -> bool:
        """Checks whether this function declaration already matches the description in Ghidra.

        Name, parent namespace, return type, calling convention and the
        parameter list must all agree. Parameters are only compared when the
        calling convention matches; otherwise they count as a mismatch.
        """
        name_match = self.name == ghidra_function.getName(False)
        namespace_match = self.namespace == ghidra_function.getParentNamespace()
        return_type_match = self.return_type == ghidra_function.getReturnType()
        # match arguments: decide if thiscall or not
        thiscall_matches = (
            self.signature.call_type == ghidra_function.getCallingConventionName()
        )

        if thiscall_matches:
            if self.signature.call_type == "__thiscall":
                args_match = self._matches_thiscall_parameters(ghidra_function)
            else:
                args_match = self._matches_non_thiscall_parameters(ghidra_function)
        else:
            args_match = False

        logger.debug(
            "Matches: namespace=%s name=%s return_type=%s thiscall=%s args=%s",
            namespace_match,
            name_match,
            return_type_match,
            thiscall_matches,
            args_match,
        )

        return (
            name_match
            and namespace_match
            and return_type_match
            and thiscall_matches
            and args_match
        )

    def _matches_non_thiscall_parameters(self, ghidra_function: "Function") -> bool:
        """Compare all Ghidra parameters of the function against the PDB arguments."""
        return self._parameter_lists_match(ghidra_function.getParameters())

    def _matches_thiscall_parameters(self, ghidra_function: "Function") -> bool:
        """Compare the Ghidra parameters, minus the leading `this`, against the PDB arguments.

        Returns False when Ghidra reports no parameters at all, because then
        there is no `this` parameter to skip.
        """
        ghidra_params = list(ghidra_function.getParameters())

        if not ghidra_params:
            # Previously this case raised an IndexError from `pop(0)`.
            logger.debug("No parameters found, expected at least `this`")
            return False

        # remove the `this` argument which we don't generate ourselves
        return self._parameter_lists_match(ghidra_params[1:])

    def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool:
        """Check that count, data types and names of the parameters agree.

        The name of each Ghidra parameter is compared with the PDB stack
        symbol found at the parameter's stack offset.
        """
        if len(self.arguments) != len(ghidra_params):
            logger.info("Mismatching argument count")
            return False

        for this_arg, ghidra_arg in zip(self.arguments, ghidra_params):
            # compare argument types
            if this_arg.getDataType() != ghidra_arg.getDataType():
                logger.debug(
                    "Mismatching arg type: expected %s, found %s",
                    this_arg.getDataType(),
                    ghidra_arg.getDataType(),
                )
                return False
            # compare argument names
            stack_match = self.get_matching_stack_symbol(ghidra_arg.getStackOffset())
            if stack_match is None:
                logger.debug("Not found on stack: %s", ghidra_arg)
                return False
            # "__formal" is the placeholder for arguments without a name
            if stack_match.name not in ["__formal", ghidra_arg.getName()]:
                logger.debug(
                    "Argument name mismatch: expected %s, found %s",
                    stack_match.name,
                    ghidra_arg.getName(),
                )
                return False
        return True

    def overwrite_ghidra_function(self, ghidra_function: "Function") -> None:
        """Replace the function declaration in Ghidra by the one derived from C++.

        Raises:
            StackOffsetMismatchError: If a stack parameter generated by Ghidra
                has no PDB stack symbol at the same offset.
        """
        ghidra_function.setName(self.name, SourceType.USER_DEFINED)
        ghidra_function.setParentNamespace(self.namespace)
        ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED)
        ghidra_function.setCallingConvention(self.call_type)

        ghidra_function.replaceParameters(
            Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,
            True,
            SourceType.USER_DEFINED,
            self.arguments,
        )

        # When we set the parameters, Ghidra will generate the layout.
        # Now we read them again and match them against the stack layout in the PDB,
        # both to verify and to set the parameter names.
        ghidra_parameters: "list[Parameter]" = ghidra_function.getParameters()  # type: ignore

        for param in ghidra_parameters:
            if param.isStackVariable():
                self._rename_stack_parameter(param)
            elif param.getName() != "this":
                # 'this' parameters are auto-generated and cannot be changed.
                # Other register parameters are not renamed yet;
                # `get_matching_register_symbol` is available for that.
                # TODO: Does this ever happen?
                logger.warning(
                    "Unhandled register variable in %s", self.get_full_name()
                )

    def _rename_stack_parameter(self, param: "Parameter"):
        """Give `param` the name of the PDB stack symbol located at its stack offset.

        A data type mismatch is logged as an error and the parameter keeps its name.

        Raises:
            StackOffsetMismatchError: If no PDB stack symbol exists at that offset.
        """
        match = self.get_matching_stack_symbol(param.getStackOffset())
        if match is None:
            raise StackOffsetMismatchError(
                f"Could not find a matching symbol at offset {param.getStackOffset()} in {self.get_full_name()}"
            )

        if param.getDataType() != get_ghidra_type(self.api, match.data_type):
            logger.error(
                "Type mismatch for parameter: %s in Ghidra, %s in PDB", param, match
            )
            return

        param.setName(match.name, SourceType.USER_DEFINED)

    def get_matching_stack_symbol(self, stack_offset: int) -> "CppStackSymbol | None":
        """Return the first `CppStackSymbol` at `stack_offset`, or None if there is none."""
        return next(
            (
                symbol
                for symbol in self.stack_symbols
                if isinstance(symbol, CppStackSymbol)
                and symbol.stack_offset == stack_offset
            ),
            None,
        )

    def get_matching_register_symbol(self, register: str) -> "CppRegisterSymbol | None":
        """Return the first `CppRegisterSymbol` stored in `register`, or None if there is none."""
        return next(
            (
                symbol
                for symbol in self.stack_symbols
                if isinstance(symbol, CppRegisterSymbol) and symbol.register == register
            ),
            None,
        )