Refactor: Introduce submodules and reload remedy

This commit is contained in:
jonschz 2024-05-23 20:06:42 +02:00
parent 86ffbc4804
commit 17b101d8fc
3 changed files with 294 additions and 243 deletions

View File

@ -17,4 +17,6 @@ Since these scripts and its dependencies are written in Python 3, [Ghidrathon](h
## Development
- Type hints for Ghidra (optional): Download a recent release from https://github.com/VDOO-Connected-Trust/ghidra-pyi-generator,
unpack it somewhere, and `pip install` that directory in this virtual environment. This provides types and headers for Python.
- Note that as of 2024-05-20 there is a [bug](https://github.com/mandiant/Ghidrathon/issues/103) in Ghidrathon v4.0.0: Changes in dependent scripts are not detected. If you modify a file that is imported by the script, you must restart Ghidra for the change to have any effect.
Be aware that some of these files contain errors - in particular, `from typing import overload` seems to be missing everywhere, leading to spurious type errors.
- Note that the imported modules persist across multiple runs of the script (see [here](https://github.com/mandiant/Ghidrathon/issues/103)).
If you intend to modify an imported library, you have to use `import importlib; importlib.reload(${library})` or restart Ghidra for your changes to have any effect.

View File

@ -9,6 +9,11 @@
# @menupath
# @toolbar
# pylint: disable=wrong-import-position,ungrouped-imports
# pylint: disable=undefined-variable # need to disable this one globally because pylint does not understand e.g. askYesNo()
import importlib
from dataclasses import dataclass, field
import sys
import logging
@ -16,14 +21,25 @@
import traceback
from typing import TYPE_CHECKING
from lego_util.exceptions import Lego1Exception
from lego_util.statistics import Statistics
# pylint: disable=undefined-variable # need to disable this one globally because pylint does not understand e.g. askYesNo()
if TYPE_CHECKING:
import ghidra
from lego_util.headers import * # pylint: disable=wildcard-import
def reload_module(module: str):
    """
    Make sure that the code of the module named `module` has been freshly executed.

    Due to a quirk in Jep (used by Ghidrathon), imported modules persist for the lifetime of the Ghidra process
    and are not reloaded when relaunching the script. Therefore, in order to facilitate development
    we force reload all our own modules at startup.

    A module that is already cached in `sys.modules` is reloaded. A module that has not been
    imported yet is only imported: importing already executes its current code, so reloading it
    right afterwards would merely run the module body a second time.

    Raises whatever `importlib.import_module` / `importlib.reload` raise,
    e.g. `ModuleNotFoundError` if the module cannot be found.
    """
    cached_module = sys.modules.get(module)
    if cached_module is None:
        # Not imported yet (or explicitly blocked with a `None` entry, in which case
        # `import_module` raises the usual `ModuleNotFoundError`).
        importlib.import_module(module)
    else:
        importlib.reload(cached_module)
reload_module("lego_util.statistics")
from lego_util.statistics import Statistics
logger = logging.getLogger(__name__)
@ -58,18 +74,7 @@ class Globals:
# In the latter case, only the C++ parser can be used.
setup_logging()
try:
# this one contains actual code
from lego_util.ghidra_helper import (
get_ghidra_namespace,
get_ghidra_type,
)
from ghidra.program.model.listing import Function, Parameter
from ghidra.program.flatapi import FlatProgramAPI
from ghidra.program.model.listing import ParameterImpl
from ghidra.program.model.listing import Function
from ghidra.program.model.symbol import SourceType
from ghidra.util.exception import CancelledException
GLOBALS.make_changes = askYesNo(
@ -82,11 +87,13 @@ class Globals:
)
GLOBALS.running_from_ghidra = True
except ImportError:
except ImportError as importError:
logger.error(
"Failed to import Ghidra functions, doing a dry run for the source code parser. "
"Has this script been launched from Ghidra?"
)
logger.debug("Precise import error:", exc_info=importError)
GLOBALS.running_from_ghidra = False
CancelledException = None
@ -102,222 +109,18 @@ def add_python_path(path: str):
sys.path.insert(1, str(venv_path))
class PdbFunctionWithGhidraObjects:
    """A representation of a function from the PDB with each type replaced by a Ghidra type instance.

    An instance can be compared against a function that already exists in Ghidra
    (`matches_ghidra_function`) and can overwrite that function's declaration
    (`overwrite_ghidra_function`).
    """

    def __init__(
        self,
        fpapi: "FlatProgramAPI",
        match_info: "MatchInfo",
        signature: "FunctionSignature",
    ):
        """
        Resolve the namespace, return type and argument types of `signature` to Ghidra objects.

        `match_info.name` must not be `None`. It is split at "::": the last component
        becomes the function name, everything before it the namespace hierarchy.
        """
        self.api = fpapi
        self.match_info = match_info
        self.signature = signature

        assert match_info.name is not None

        colon_split = match_info.name.split("::")
        self.name = colon_split.pop()
        namespace_hierarchy = colon_split
        self.namespace = get_ghidra_namespace(fpapi, namespace_hierarchy)

        self.return_type = get_ghidra_type(fpapi, signature.return_type)
        # The placeholder names are replaced in `overwrite_ghidra_function`,
        # once Ghidra has generated the parameter layout.
        self.arguments = [
            ParameterImpl(
                f"param{index}",
                get_ghidra_type(fpapi, type_name),
                fpapi.getCurrentProgram(),
            )
            for (index, type_name) in enumerate(signature.arglist)
        ]

    @property
    def call_type(self):
        """The calling convention as stored in the signature."""
        return self.signature.call_type

    @property
    def stack_symbols(self):
        """The symbols of the signature; searched for stack and register symbols."""
        return self.signature.stack_symbols

    def get_full_name(self) -> str:
        """Return the name of the function, prefixed by the name of its namespace."""
        return f"{self.namespace.getName()}::{self.name}"

    def format_proposed_change(self) -> str:
        """Return a one-line, human-readable declaration of this function."""
        return (
            f"{self.return_type} {self.call_type} {self.get_full_name()}"
            + f"({', '.join(self.signature.arglist)})"
        )

    def matches_ghidra_function(self, ghidra_function: "Function") -> bool:
        """Checks whether this function declaration already matches the description in Ghidra"""
        name_match = self.name == ghidra_function.getName(False)
        namespace_match = self.namespace == ghidra_function.getParentNamespace()
        return_type_match = self.return_type == ghidra_function.getReturnType()
        # match arguments: decide if thiscall or not
        thiscall_matches = (
            self.signature.call_type == ghidra_function.getCallingConventionName()
        )

        if thiscall_matches:
            if self.signature.call_type == "__thiscall":
                args_match = self._matches_thiscall_parameters(ghidra_function)
            else:
                args_match = self._matches_non_thiscall_parameters(ghidra_function)
        else:
            # The parameters are not compared when the calling convention differs.
            args_match = False

        logger.debug(
            "Matches: namespace=%s name=%s return_type=%s thiscall=%s args=%s",
            namespace_match,
            name_match,
            return_type_match,
            thiscall_matches,
            args_match,
        )

        return (
            name_match
            and namespace_match
            and return_type_match
            and thiscall_matches
            and args_match
        )

    def _matches_non_thiscall_parameters(self, ghidra_function: "Function") -> bool:
        """Compare all parameters of `ghidra_function` against our arguments."""
        return self._parameter_lists_match(ghidra_function.getParameters())

    def _matches_thiscall_parameters(self, ghidra_function: "Function") -> bool:
        """Compare the parameters of `ghidra_function`, except for the leading `this`, against our arguments."""
        ghidra_params = list(ghidra_function.getParameters())

        if not ghidra_params:
            # There is not even a `this` parameter to drop, so the function cannot match.
            # (Previously this case raised an IndexError.)
            return False

        # remove the `this` argument which we don't generate ourselves
        ghidra_params.pop(0)

        return self._parameter_lists_match(ghidra_params)

    def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool:
        """Return True if count, data types and names of `ghidra_params` agree with the PDB data."""
        if len(self.arguments) != len(ghidra_params):
            logger.info("Mismatching argument count")
            return False

        for this_arg, ghidra_arg in zip(self.arguments, ghidra_params):
            # compare argument types
            if this_arg.getDataType() != ghidra_arg.getDataType():
                logger.debug(
                    "Mismatching arg type: expected %s, found %s",
                    this_arg.getDataType(),
                    ghidra_arg.getDataType(),
                )
                return False

            # compare argument names
            if not ghidra_arg.isStackVariable():
                # Ghidra documents `getStackOffset()` as unsupported for variables that
                # are not simple stack variables, so such a parameter cannot be matched.
                logger.debug("Not a stack variable: %s", ghidra_arg)
                return False

            stack_match = self.get_matching_stack_symbol(ghidra_arg.getStackOffset())
            if stack_match is None:
                logger.debug("Not found on stack: %s", ghidra_arg)
                return False

            # "__formal" is the placeholder for arguments without a name
            if stack_match.name not in ["__formal", ghidra_arg.getName()]:
                logger.debug(
                    "Argument name mismatch: expected %s, found %s",
                    stack_match.name,
                    ghidra_arg.getName(),
                )
                return False

        return True

    def overwrite_ghidra_function(self, ghidra_function: "Function") -> None:
        """
        Replace the function declaration in Ghidra by the one derived from C++.

        Raises `StackOffsetMismatchError` (via `_rename_stack_parameter`) if a stack
        parameter generated by Ghidra has no counterpart in the PDB symbols.
        """
        ghidra_function.setName(self.name, SourceType.USER_DEFINED)
        ghidra_function.setParentNamespace(self.namespace)
        ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED)
        ghidra_function.setCallingConvention(self.call_type)

        ghidra_function.replaceParameters(
            Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,
            True,
            SourceType.USER_DEFINED,
            self.arguments,
        )

        # When we set the parameters, Ghidra will generate the layout.
        # Now we read them again and match them against the stack layout in the PDB,
        # both to verify and to set the parameter names.
        ghidra_parameters: "list[Parameter]" = ghidra_function.getParameters()  # type: ignore

        # Try to add Ghidra function names
        for param in ghidra_parameters:
            if param.isStackVariable():
                self._rename_stack_parameter(param)
            elif param.getName() == "this":
                # 'this' parameters are auto-generated and cannot be changed
                continue
            else:
                # TODO: Does this ever happen? If it does, `get_matching_register_symbol`
                # is available to look up the PDB symbol by register name.
                logger.warning(
                    "Unhandled register variable in %s", self.get_full_name()
                )

    def _rename_stack_parameter(self, param: "Parameter"):
        """
        Give `param` the name of the PDB symbol located at the same stack offset.

        Raises `StackOffsetMismatchError` if there is no symbol at that offset.
        If the data types differ, an error is logged and the name is left unchanged.
        """
        match = self.get_matching_stack_symbol(param.getStackOffset())
        if match is None:
            raise StackOffsetMismatchError(
                f"Could not find a matching symbol at offset {param.getStackOffset()} in {self.get_full_name()}"
            )

        if param.getDataType() != get_ghidra_type(self.api, match.data_type):
            logger.error(
                "Type mismatch for parameter: %s in Ghidra, %s in PDB", param, match
            )
            return

        param.setName(match.name, SourceType.USER_DEFINED)

    def get_matching_stack_symbol(self, stack_offset: int) -> "CppStackSymbol | None":
        """Return the first `CppStackSymbol` at `stack_offset`, or `None` if there is none."""
        return next(
            (
                symbol
                for symbol in self.stack_symbols
                if isinstance(symbol, CppStackSymbol)
                and symbol.stack_offset == stack_offset
            ),
            None,
        )

    def get_matching_register_symbol(self, register: str) -> "CppRegisterSymbol | None":
        """Return the first `CppRegisterSymbol` stored in `register`, or `None` if there is none."""
        return next(
            (
                symbol
                for symbol in self.stack_symbols
                if isinstance(symbol, CppRegisterSymbol) and symbol.register == register
            ),
            None,
        )
def handle_function_in_ghidra(match_info: "MatchInfo", signature: "FunctionSignature"):
if not GLOBALS.running_from_ghidra:
return
def migrate_function_to_ghidra(
api: "FlatProgramAPI",
match_info: "MatchInfo",
signature: "FunctionSignature"
):
hex_original_address = f"{match_info.orig_addr:x}"
# Find the Ghidra function at that address
ghidra_address = getAddressFactory().getAddress(hex_original_address) # type: ignore
ghidra_address = getAddressFactory().getAddress(hex_original_address)
fpapi = FlatProgramAPI(currentProgram()) # type: ignore
typed_pdb_function = PdbFunctionWithGhidraObjects(fpapi, match_info, signature)
typed_pdb_function = PdbFunctionWithGhidraObjects(api, match_info, signature)
if not GLOBALS.make_changes:
return
@ -354,7 +157,7 @@ def handle_function_in_ghidra(match_info: "MatchInfo", signature: "FunctionSigna
logger.critical("User quit, terminating")
raise SystemExit(1)
# logger.info("Modifying function %s at 0x%s", cpp_function.full_name(), address)
logger.debug("Modifying function %s at 0x%s", typed_pdb_function.get_full_name(), hex_original_address)
typed_pdb_function.overwrite_ghidra_function(ghidra_function)
@ -362,16 +165,22 @@ def handle_function_in_ghidra(match_info: "MatchInfo", signature: "FunctionSigna
if GLOBALS.prompt_before_changes:
# Add a prompt so we can verify the result immediately
askChoice("", "Click 'OK' to continue", ["OK"], "OK")
askChoice("Continue", "Click 'OK' to continue", ["OK"], "OK")
def handle_function_list(isle_compare: "IsleCompare"):
def process_functions(isle_compare: "IsleCompare"):
# try to acquire matched functions
migration = PdbExtractionForGhidraMigration(isle_compare)
func_signatures = migration.get_function_list()
if not GLOBALS.running_from_ghidra:
logger.info("Completed the dry run outside Ghidra.")
return
fpapi = FlatProgramAPI(currentProgram())
for match_info, signature in func_signatures:
try:
handle_function_in_ghidra(match_info, signature)
migrate_function_to_ghidra(fpapi, match_info, signature)
GLOBALS.statistics.successes += 1
except Lego1Exception as e:
log_and_track_failure(e)
@ -415,9 +224,10 @@ def main():
logger.info("Comparison complete.")
try:
handle_function_list(isle_compare)
process_functions(isle_compare)
finally:
GLOBALS.statistics.log()
if GLOBALS.running_from_ghidra:
GLOBALS.statistics.log()
logger.info("Done")
@ -425,24 +235,31 @@ def main():
# sys.path is not reset after running the script, so we should restore it
sys_path_backup = sys.path.copy()
try:
add_python_path(
".venv/Lib/site-packages"
) # make modules installed in the venv available in Ghidra
add_python_path(
"tools/isledecomp"
) # needed when isledecomp is installed in editable mode in the venv
# make modules installed in the venv available in Ghidra
add_python_path(".venv/Lib/site-packages")
# This one is needed when isledecomp is installed in editable mode in the venv
add_python_path("tools/isledecomp")
import setuptools # pylint: disable=unused-import # required to fix a distutils issue in Python 3.12
reload_module("isledecomp")
from isledecomp import Bin
reload_module("isledecomp.compare")
from isledecomp.compare import Compare as IsleCompare
reload_module("isledecomp.compare.db")
from isledecomp.compare.db import MatchInfo
from lego_util.pdb_extraction import ( # pylint: disable=ungrouped-imports # these must be imported
reload_module("lego_util.exceptions")
from lego_util.exceptions import Lego1Exception
reload_module("lego_util.pdb_extraction")
from lego_util.pdb_extraction import (
PdbExtractionForGhidraMigration,
FunctionSignature,
CppRegisterSymbol,
CppStackSymbol,
)
from lego_util.exceptions import StackOffsetMismatchError
if GLOBALS.running_from_ghidra:
reload_module("lego_util.pdb_to_ghidra")
from lego_util.pdb_to_ghidra import PdbFunctionWithGhidraObjects
if __name__ == "__main__":
main()

View File

@ -0,0 +1,232 @@
# This file can only be imported successfully when run from Ghidra using Ghidrathon.
# Disable spurious warnings in vscode / pylance
# pyright: reportMissingModuleSource=false
import logging
from ghidra.program.model.listing import Function, Parameter
from ghidra.program.flatapi import FlatProgramAPI
from ghidra.program.model.listing import ParameterImpl
from ghidra.program.model.symbol import SourceType
from isledecomp.compare.db import MatchInfo
from lego_util.pdb_extraction import (
FunctionSignature,
CppRegisterSymbol,
CppStackSymbol,
)
from lego_util.ghidra_helper import (
get_ghidra_namespace,
get_ghidra_type,
)
from lego_util.exceptions import StackOffsetMismatchError
logger = logging.getLogger(__name__)
class PdbFunctionWithGhidraObjects:
    """A representation of a function from the PDB with each type replaced by a Ghidra type instance.

    An instance can be compared against a function that already exists in Ghidra
    (`matches_ghidra_function`) and can overwrite that function's declaration
    (`overwrite_ghidra_function`).
    """

    def __init__(
        self,
        fpapi: "FlatProgramAPI",
        match_info: "MatchInfo",
        signature: "FunctionSignature",
    ):
        """
        Resolve the namespace, return type and argument types of `signature` to Ghidra objects.

        `match_info.name` must not be `None`. It is split at "::": the last component
        becomes the function name, everything before it the namespace hierarchy.
        """
        self.api = fpapi
        self.match_info = match_info
        self.signature = signature

        assert match_info.name is not None

        colon_split = match_info.name.split("::")
        self.name = colon_split.pop()
        namespace_hierarchy = colon_split
        self.namespace = get_ghidra_namespace(fpapi, namespace_hierarchy)

        self.return_type = get_ghidra_type(fpapi, signature.return_type)
        # The placeholder names are replaced in `overwrite_ghidra_function`,
        # once Ghidra has generated the parameter layout.
        self.arguments = [
            ParameterImpl(
                f"param{index}",
                get_ghidra_type(fpapi, type_name),
                fpapi.getCurrentProgram(),
            )
            for (index, type_name) in enumerate(signature.arglist)
        ]

    @property
    def call_type(self):
        """The calling convention as stored in the signature."""
        return self.signature.call_type

    @property
    def stack_symbols(self):
        """The symbols of the signature; searched for stack and register symbols."""
        return self.signature.stack_symbols

    def get_full_name(self) -> str:
        """Return the name of the function, prefixed by the name of its namespace."""
        return f"{self.namespace.getName()}::{self.name}"

    def format_proposed_change(self) -> str:
        """Return a one-line, human-readable declaration of this function."""
        return (
            f"{self.return_type} {self.call_type} {self.get_full_name()}"
            + f"({', '.join(self.signature.arglist)})"
        )

    def matches_ghidra_function(self, ghidra_function: "Function") -> bool:
        """Checks whether this function declaration already matches the description in Ghidra"""
        name_match = self.name == ghidra_function.getName(False)
        namespace_match = self.namespace == ghidra_function.getParentNamespace()
        return_type_match = self.return_type == ghidra_function.getReturnType()
        # match arguments: decide if thiscall or not
        thiscall_matches = (
            self.signature.call_type == ghidra_function.getCallingConventionName()
        )

        if thiscall_matches:
            if self.signature.call_type == "__thiscall":
                args_match = self._matches_thiscall_parameters(ghidra_function)
            else:
                args_match = self._matches_non_thiscall_parameters(ghidra_function)
        else:
            # The parameters are not compared when the calling convention differs.
            args_match = False

        logger.debug(
            "Matches: namespace=%s name=%s return_type=%s thiscall=%s args=%s",
            namespace_match,
            name_match,
            return_type_match,
            thiscall_matches,
            args_match,
        )

        return (
            name_match
            and namespace_match
            and return_type_match
            and thiscall_matches
            and args_match
        )

    def _matches_non_thiscall_parameters(self, ghidra_function: "Function") -> bool:
        """Compare all parameters of `ghidra_function` against our arguments."""
        return self._parameter_lists_match(ghidra_function.getParameters())

    def _matches_thiscall_parameters(self, ghidra_function: "Function") -> bool:
        """Compare the parameters of `ghidra_function`, except for the leading `this`, against our arguments."""
        ghidra_params = list(ghidra_function.getParameters())

        if not ghidra_params:
            # There is not even a `this` parameter to drop, so the function cannot match.
            # (Previously this case raised an IndexError.)
            return False

        # remove the `this` argument which we don't generate ourselves
        ghidra_params.pop(0)

        return self._parameter_lists_match(ghidra_params)

    def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool:
        """Return True if count, data types and names of `ghidra_params` agree with the PDB data."""
        if len(self.arguments) != len(ghidra_params):
            logger.info("Mismatching argument count")
            return False

        for this_arg, ghidra_arg in zip(self.arguments, ghidra_params):
            # compare argument types
            if this_arg.getDataType() != ghidra_arg.getDataType():
                logger.debug(
                    "Mismatching arg type: expected %s, found %s",
                    this_arg.getDataType(),
                    ghidra_arg.getDataType(),
                )
                return False

            # compare argument names
            if not ghidra_arg.isStackVariable():
                # Ghidra documents `getStackOffset()` as unsupported for variables that
                # are not simple stack variables, so such a parameter cannot be matched.
                logger.debug("Not a stack variable: %s", ghidra_arg)
                return False

            stack_match = self.get_matching_stack_symbol(ghidra_arg.getStackOffset())
            if stack_match is None:
                logger.debug("Not found on stack: %s", ghidra_arg)
                return False

            # "__formal" is the placeholder for arguments without a name
            if stack_match.name not in ["__formal", ghidra_arg.getName()]:
                logger.debug(
                    "Argument name mismatch: expected %s, found %s",
                    stack_match.name,
                    ghidra_arg.getName(),
                )
                return False

        return True

    def overwrite_ghidra_function(self, ghidra_function: "Function") -> None:
        """
        Replace the function declaration in Ghidra by the one derived from C++.

        Raises `StackOffsetMismatchError` (via `_rename_stack_parameter`) if a stack
        parameter generated by Ghidra has no counterpart in the PDB symbols.
        """
        ghidra_function.setName(self.name, SourceType.USER_DEFINED)
        ghidra_function.setParentNamespace(self.namespace)
        ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED)
        ghidra_function.setCallingConvention(self.call_type)

        ghidra_function.replaceParameters(
            Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,
            True,
            SourceType.USER_DEFINED,
            self.arguments,
        )

        # When we set the parameters, Ghidra will generate the layout.
        # Now we read them again and match them against the stack layout in the PDB,
        # both to verify and to set the parameter names.
        ghidra_parameters: "list[Parameter]" = ghidra_function.getParameters()  # type: ignore

        # Try to add Ghidra function names
        for param in ghidra_parameters:
            if param.isStackVariable():
                self._rename_stack_parameter(param)
            elif param.getName() == "this":
                # 'this' parameters are auto-generated and cannot be changed
                continue
            else:
                # TODO: Does this ever happen? If it does, `get_matching_register_symbol`
                # is available to look up the PDB symbol by register name.
                logger.warning(
                    "Unhandled register variable in %s", self.get_full_name()
                )

    def _rename_stack_parameter(self, param: "Parameter"):
        """
        Give `param` the name of the PDB symbol located at the same stack offset.

        Raises `StackOffsetMismatchError` if there is no symbol at that offset.
        If the data types differ, an error is logged and the name is left unchanged.
        """
        match = self.get_matching_stack_symbol(param.getStackOffset())
        if match is None:
            raise StackOffsetMismatchError(
                f"Could not find a matching symbol at offset {param.getStackOffset()} in {self.get_full_name()}"
            )

        if param.getDataType() != get_ghidra_type(self.api, match.data_type):
            logger.error(
                "Type mismatch for parameter: %s in Ghidra, %s in PDB", param, match
            )
            return

        param.setName(match.name, SourceType.USER_DEFINED)

    def get_matching_stack_symbol(self, stack_offset: int) -> "CppStackSymbol | None":
        """Return the first `CppStackSymbol` at `stack_offset`, or `None` if there is none."""
        return next(
            (
                symbol
                for symbol in self.stack_symbols
                if isinstance(symbol, CppStackSymbol)
                and symbol.stack_offset == stack_offset
            ),
            None,
        )

    def get_matching_register_symbol(self, register: str) -> "CppRegisterSymbol | None":
        """Return the first `CppRegisterSymbol` stored in `register`, or `None` if there is none."""
        return next(
            (
                symbol
                for symbol in self.stack_symbols
                if isinstance(symbol, CppRegisterSymbol) and symbol.register == register
            ),
            None,
        )