diff --git a/.gitignore b/.gitignore index 7e16a6ce..78fe1384 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ LEGO1.DLL LEGO1PROGRESS.* ISLEPROGRESS.* *.pyc +*$py.class \ No newline at end of file diff --git a/tools/ghidra_scripts/README.md b/tools/ghidra_scripts/README.md new file mode 100644 index 00000000..95dd5707 --- /dev/null +++ b/tools/ghidra_scripts/README.md @@ -0,0 +1,12 @@ +# Ghidra Scripts + +## Setup +- In Ghidra, _Open Window -> Script Manager_. +- Click the _Manage Script Directories_ button on the top right. +- Click the _Add_ button and select this file's parent directory. +- Close the window and click the _Refresh_ button. +- This script should now be available under the folder _LEGO1_. + +## Development +- Type hints for Ghidra (optional): Download a recent release from https://github.com/VDOO-Connected-Trust/ghidra-pyi-generator, + unpack it somewhere, and `pip install` that directory in this virtual environment. This provides types and headers for Python. diff --git a/tools/ghidra_scripts/SyncFunctionsToGhidra.py b/tools/ghidra_scripts/SyncFunctionsToGhidra.py new file mode 100644 index 00000000..5574c3e9 --- /dev/null +++ b/tools/ghidra_scripts/SyncFunctionsToGhidra.py @@ -0,0 +1,300 @@ +# Synchronised the function signatures of LEGO1.dll to Ghidra. +# At startup there will be several prompts for different modes, +# including a read-only / dry run mode. + +# @author J. 
Schulz +# @category LEGO1 +# @keybinding +# @menupath +# @toolbar + + +# Disable spurious warnings in vscode / pylance +# pyright: reportMissingModuleSource=false + +import sys +import os +import re +import traceback +import logging + +from lego_util.cpp_parser import ( + CppFunctionDeclaration, + function_regex, + class_regex, + struct_regex, + namespace_regex, +) +from lego_util.file_helper import iterate_dir +from lego_util.exceptions import ( + Lego1Exception, + NamespaceNotFoundInGhidraError, + TypeNotFoundInGhidraError, + FunctionNotFoundInGhidraError, +) + +# # no effect when no Ghidra is used +# READ_ONLY = False +# # READ_ONLY = True + + +# Type annotations are only available in Python 3.5 or later +if sys.version_info.major > 2: + from typing import TYPE_CHECKING, TypeVar + + if TYPE_CHECKING: + from ghidra.program.model.address import Address, AddressFactory + from ghidra.program.model.listing import Program + from ghidra.program.model.data import DataType + from ghidra.program.model.symbol import Namespace + from ghidra.app.script import GhidraScript + from ghidra.app.script import GhidraState + + # Global stubs, Python 2 and 3 compatible + + def _get_state(): # type: () -> GhidraState + return None # type: ignore + + state = _get_state() + + def getDataTypes(name): # type: (str) -> list[DataType] + return # type: ignore + + def getCurrentProgram(): # type: () -> Program + return # type: ignore + + def getFunctionAt(entryPoint): # type: (Address) -> Function + return # type: ignore + + def getAddressFactory(): # type: () -> AddressFactory + return # type: ignore + + def getNamespace(parent, namespaceName): # type: (Namespace, str) -> Namespace + return # type: ignore + + def askYesNo(title, message): # type: (str, str) -> bool + return # type: ignore + + T = TypeVar("T") + + def askChoice( + title, message, choices, defaultValue + ): # type: (str, str, list[T], T) -> T + return # type: ignore + + +# This script can be run both from Ghidra and as a 
standalone. +# In the latter case, only the C++ parser can be used. +try: + from ghidra.program.model.listing import Function + from ghidra.program.flatapi import FlatProgramAPI + + from lego_util.ghidra_helper import CppFunctionWithGhidraTypes + + # This is needed for Ghidra API calls in submodules + API = FlatProgramAPI(state.getCurrentProgram()) + + MAKE_CHANGES = askYesNo( + "Make changes?", "Select 'Yes' to apply changes, select 'No' to do a dry run." + ) + + if MAKE_CHANGES: + PROMPT_BEFORE_CHANGE = askYesNo( + "Prompt before changes?", "Should each change be confirmed by a prompt?" + ) + else: + # for the linter, has no effect anyway + PROMPT_BEFORE_CHANGE = True + + RUNNING_FROM_GHIDRA = True +except ImportError: + RUNNING_FROM_GHIDRA = False + MAKE_CHANGES = False + + +CLASSES_AND_STRUCTS = set() # type: set[str] +NAMESPACES = set() # type: set[str] + +SUCCESSES = 0 +FAILURES = {} # type: dict[str, int] +KNOWN_MISSING_TYPES = {} # type: dict[str, int] +KNOWN_MISSING_NAMESPACES = set() # type: set[str] + +FUNCTIONS_CHANGED = 0 + + +def main(): + logging.basicConfig( + format="%(levelname)-8s %(message)s", stream=sys.stdout, level=logging.INFO + ) + if not RUNNING_FROM_GHIDRA: + logging.error( + "Failed to import Ghidra functions, doing a dry run for the source code parser. " + "Has this script been launched from Ghidra?" 
def log_and_track_failure(
    file_path, error, unexpected=False
):  # type: (str, Exception, bool) -> None
    """
    Record a failed function import in the global statistics and log it.

    The failure is counted in `FAILURES` under the exception's class name.
    Missing types are additionally counted in `KNOWN_MISSING_TYPES` and
    missing namespaces are remembered in `KNOWN_MISSING_NAMESPACES`; for
    both, only the first occurrence is written to the log.

    Args:
        file_path: Path of the source file being processed; only its base
            name appears in the log message.
        error: The exception that was raised for the function.
        unexpected: If true, the log line is prefixed with
            "Unexpected error in ".
    """
    failure_kind = error.__class__.__name__
    FAILURES[failure_kind] = FAILURES.get(failure_kind, 0) + 1

    if isinstance(error, TypeNotFoundInGhidraError):
        type_name = error.args[0]
        seen_before = KNOWN_MISSING_TYPES.get(type_name, 0)
        KNOWN_MISSING_TYPES[type_name] = seen_before + 1
        if seen_before > 0:
            # Already reported once; keep counting but stay quiet
            return

    if isinstance(error, NamespaceNotFoundInGhidraError):
        namespace_str = error.get_namespace_str()
        if namespace_str in KNOWN_MISSING_NAMESPACES:
            # Already reported once; stay quiet
            return
        KNOWN_MISSING_NAMESPACES.add(namespace_str)

    prefix = "Unexpected error in " if unexpected else ""
    logging.error("%s%s: %s", prefix, os.path.basename(file_path), error)
+ + if cpp_function.return_type in CLASSES_AND_STRUCTS: + # edge case handling - Ghidra does not understand what happens under the hood. + # These must be set manually + logging.error( + "Unimplemented edge case at 0x%s: Return value is a non-referenced struct or class: %s", + address, + cpp_function, + ) + return + + if not RUNNING_FROM_GHIDRA: + return + + # Find the Ghidra function at that address + ghidra_address = getAddressFactory().getAddress(address) + ghidra_function = getFunctionAt(ghidra_address) + if ghidra_function is None: + raise FunctionNotFoundInGhidraError(address) + + # Convert the C++ data types to Ghidra data types + typed_cpp_function = CppFunctionWithGhidraTypes(API, cpp_function) + + if typed_cpp_function.matches_ghidra_function(ghidra_function): + logging.debug( + "Skipping function '%s', matches already", cpp_function.full_name() + ) + return + + if not MAKE_CHANGES: + return + + # Navigate Ghidra to the current function + state.setCurrentAddress(ghidra_address) + + if PROMPT_BEFORE_CHANGE: + choice = askChoice( + "Change function?", + "Change to %s" % cpp_function, + ["Yes", "No", "Abort"], + "Yes", + ) + if choice == "No": + return + if choice != "Yes": + logging.critical("User quit, terminating") + raise SystemExit(1) + + logging.info("Modifying function %s at 0x%s", cpp_function.full_name(), address) + + typed_cpp_function.overwrite_ghidra_function(ghidra_function) + + FUNCTIONS_CHANGED += 1 + + if PROMPT_BEFORE_CHANGE: + # Add a prompt so we can verify the result immediately + askChoice("", "Click 'OK' to continue", ["OK"], "OK") + + +def search_for_classes_and_structs(header_file): # type: (str) -> None + global CLASSES_AND_STRUCTS, NAMESPACES + + if not (header_file.endswith(".h") or header_file.endswith(".cpp")): + return + try: + with open(header_file) as infile: + headers = infile.read() + except Exception: + logging.error( + "Error handling header file: %s\n%s", header_file, traceback.format_exc() + ) + return + + 
class CppFunctionDeclaration:
    """
    A rudimentary parser for C++ function signatures in LEGO1.
    Assumes that the C++ code has been formatted to some degree.

    After construction the following attributes are available:

    - ``name``: the unqualified function name (the part after the last ``::``).
    - ``namespace_hierachy``: the enclosing namespaces / classes, outermost first.
    - ``return_type``: the first leading token that is neither ``const`` nor
      ``inline``. For functions without any leading token it is ``"void"``
      (name starts with ``~``) or ``"<name>*"`` (assumed constructor).
    - ``class_name``: the innermost entry of ``namespace_hierachy`` if it is a
      known class or struct, otherwise ``None``.
    - ``flags``: the tokens left over between the return type and the name.
    - ``arguments``: a list of ``(type, name)`` tuples.
    """

    def __init__(
        self, fn, start_index, classes_and_structs
    ):  # type: (CppFunctionDeclaration, str, int, set[str]) -> None
        """
        Parse the function signature that starts at `fn[start_index]`.

        Args:
            fn: Source text containing the function.
            start_index: Index into `fn` at which the signature begins.
            classes_and_structs: Names of all known classes and structs.

        Raises:
            UnsupportedCppSyntaxError: The signature uses syntax this parser
                does not understand (including a function that has neither a
                return type nor an enclosing namespace).
            CppUnknownClassOrNamespaceError: The innermost namespace is not in
                `classes_and_structs`.
            AssertionError: No opening parenthesis was found.
            IndexError: Unmatched parentheses, or only keywords in front of
                the function name.
        """
        first_part_str, second_part = self._split_off_declaration_and_arguments(
            fn[start_index:]
        )

        try:
            # Split on any whitespace so that indentation or repeated blanks
            # cannot produce empty tokens that get mistaken for a return type.
            first_part = first_part_str.split()
            if not first_part:
                raise UnsupportedCppSyntaxError("Missing function name")

            full_function_name = first_part.pop()
            colon_split = full_function_name.split("::")
            self.name = colon_split.pop()
            self.namespace_hierachy = colon_split

            if first_part:
                while True:
                    # desired failure if we only get keywords and no return type
                    self.return_type = first_part.pop(0)
                    if self.return_type not in ["const", "inline"]:
                        break
            else:
                # most likely a constructor or destructor; those always belong
                # to a class, so an empty hierarchy means we misparsed something
                if not self.namespace_hierachy:
                    raise UnsupportedCppSyntaxError(
                        "Unhandled function without return type or namespace"
                    )
                if self.name.startswith("~"):
                    self.return_type = "void"
                else:
                    self.return_type = self.name + "*"

            # evaluate if we belong to a class, assume __thiscall
            self.class_name = None
            if self.namespace_hierachy:
                bottom_level_namespace = self.namespace_hierachy[-1]
                if bottom_level_namespace in classes_and_structs:
                    self.class_name = bottom_level_namespace
                else:
                    raise CppUnknownClassOrNamespaceError(bottom_level_namespace)

            # don't add a `this` argument, let Ghidra handle that
            self.flags = first_part
            if second_part.strip():
                self.arguments = [
                    self._parse_argument(i, x)
                    for i, x in enumerate(second_part.split(","))
                ]
            else:
                self.arguments = []

        except UnsupportedCppSyntaxError as e:
            # re-raise with the offending signature attached for context
            raise UnsupportedCppSyntaxError(
                "%s. In: '%s(%s)'" % (e.args[0], first_part_str, second_part)
            )

    def __str__(self):
        """Return a C-like one-line rendering, with an explicit `this` for methods."""
        args = ["%s %s" % pair for pair in self.arguments]
        parts = [self.return_type]
        if self.class_name:
            # add the "this" argument to the output
            args = [("%s* this" % self.class_name)] + args
            parts.append("__thiscall")
        # joining token-wise keeps a separator between the flags and the name
        parts.extend(self.flags)
        parts.append(self.full_name())
        return "%s(%s)" % (" ".join(parts), ", ".join(args))

    def full_name(self):
        """Return the function name qualified with all namespaces, joined by `::`."""
        return "::".join(self.namespace_hierachy + [self.name])

    def _parse_argument(
        self, index, argument_str
    ):  # type: (int, str) -> tuple[str, str]
        """
        Parse a single argument of the argument list.

        `const` is dropped; an argument without a name is called
        ``param<index + 1>``.

        Returns: (type, name)

        Raises:
            UnsupportedCppSyntaxError: The argument is empty or consists of
                more than a type and a name.
        """
        # Cleanup, handle `const`
        filtered = [x for x in argument_str.split() if x.lower() != "const"]

        if len(filtered) == 0:
            raise UnsupportedCppSyntaxError(
                "Expected more arguments: '%s'" % argument_str.strip()
            )
        if len(filtered) == 1:
            # unnamed argument
            return (filtered[0], "param%d" % (index + 1))
        if len(filtered) == 2:
            return (filtered[0], filtered[1])

        raise UnsupportedCppSyntaxError(
            "Unsupported argument syntax: '%s'" % argument_str.strip()
        )

    def _split_off_declaration_and_arguments(
        self, fn
    ):  # type: (str) -> tuple[str, str]
        """
        Split `fn` into the text before the first `(` and the text between
        that parenthesis and its matching `)`.

        In both parts `unsigned ` is collapsed into a `u` prefix and line
        breaks are removed.

        Raises:
            AssertionError: `fn` contains no opening parenthesis.
            IndexError: The parentheses are unbalanced.
        """
        first_paren = fn.find("(")
        assert first_paren >= 0, "No opening parenthesis found in function '%s'" % fn

        paren_stack = 1
        close_paren = first_paren
        while paren_stack > 0:
            # In case of unmatched parentheses we run into an IndexError,
            # which is expected behaviour
            close_paren += 1
            if fn[close_paren] == "(":
                paren_stack += 1
            elif fn[close_paren] == ")":
                paren_stack -= 1

        # Only normalise the two slices we return; `fn` usually holds the whole
        # remainder of the file. Handles `unsigned` in arguments and result.
        declaration, arguments = (
            part.replace("unsigned ", "u").replace("\n", "")
            for part in (fn[:first_paren], fn[first_paren + 1 : close_paren])
        )
        return (declaration, arguments)
def iterate_dir(path, file_callback):  # type: (str, Callable[[str], None]) -> None
    """
    Recursively walk `path` and call `file_callback` for every entry below it
    that is not a directory.

    The entries of each directory are visited in sorted name order.
    `os.listdir` returns them in arbitrary order, so without sorting the
    traversal (and any logging or prompting done by the callback) could
    differ between runs and platforms.

    Args:
        path: Directory to walk.
        file_callback: Called with the joined path of each non-directory entry.

    Raises:
        OSError: A directory could not be listed.
    """
    for file_or_dir_name in sorted(os.listdir(path)):  # pathlib not supported
        child_path = os.path.join(path, file_or_dir_name)
        if os.path.isdir(child_path):
            iterate_dir(child_path, file_callback)
        else:
            file_callback(child_path)
a/tools/ghidra_scripts/lego_util/ghidra_helper.py b/tools/ghidra_scripts/lego_util/ghidra_helper.py new file mode 100644 index 00000000..05283995 --- /dev/null +++ b/tools/ghidra_scripts/lego_util/ghidra_helper.py @@ -0,0 +1,173 @@ +import logging +import sys +import re + +from lego_util.exceptions import ( + NamespaceNotFoundInGhidraError, + TypeNotFoundInGhidraError, + MultipleTypesFoundInGhidraError, +) +from lego_util.cpp_parser import CppFunctionDeclaration + +# Disable spurious warnings in vscode / pylance +# pyright: reportMissingModuleSource=false + +from ghidra.program.model.data import PointerDataType +from ghidra.program.model.data import DataTypeConflictHandler +from ghidra.program.model.listing import ParameterImpl +from ghidra.program.model.listing import Function +from ghidra.program.model.symbol import SourceType + +# Type annotations are only available in Python 3.5 or later +if sys.version_info.major > 2: + from typing import TYPE_CHECKING + + if TYPE_CHECKING: + from ghidra.program.flatapi import FlatProgramAPI + from ghidra.program.model.data import DataType + from ghidra.program.model.symbol import Namespace + from ghidra.program.model.listing import Parameter + + +def get_ghidra_type(api, type_name): # type: (FlatProgramAPI, str) -> DataType + """ + Searches for the type named `typeName` in Ghidra. + + Raises: + NotFoundInGhidraError: + """ + + # references to pointers + type_name = type_name.replace("&", " *") + # handle reference spacing (void* -> void *) + type_name = re.sub(r"(? 
class CppFunctionWithGhidraTypes(object):
    """Collects the matching Ghidra entities for a C++ function declaration."""

    def __init__(
        self, fpapi, cpp_fn_decl
    ):  # type: (FlatProgramAPI, CppFunctionDeclaration) -> None
        """
        Resolve the return type, the arguments and the namespace of
        `cpp_fn_decl` through `get_ghidra_type` / `get_ghidra_namespace`,
        in that order.
        """
        self.name = cpp_fn_decl.name
        self.class_name = cpp_fn_decl.class_name
        self.return_type = get_ghidra_type(fpapi, cpp_fn_decl.return_type)

        parameters = []
        for type_name, name in cpp_fn_decl.arguments:
            data_type = get_ghidra_type(fpapi, type_name)
            parameters.append(
                ParameterImpl(name, data_type, fpapi.getCurrentProgram())
            )
        self.arguments = parameters

        self.namespace = get_ghidra_namespace(fpapi, cpp_fn_decl.namespace_hierachy)

    def matches_ghidra_function(self, ghidra_function):  # type: (Function) -> bool
        """Checks whether this function declaration already matches the description in Ghidra"""
        name_match = self.name == ghidra_function.getName(False)
        namespace_match = self.namespace == ghidra_function.getParentNamespace()
        return_type_match = self.return_type == ghidra_function.getReturnType()

        # A function is treated as __thiscall exactly when it has a class
        is_method = self.class_name is not None
        thiscall_matches = is_method == (
            ghidra_function.getCallingConventionName() == "__thiscall"
        )

        # The parameters are only compared when the calling convention agrees
        if not thiscall_matches:
            args_match = False
        elif is_method:
            args_match = self._matches_thiscall_parameters(ghidra_function)
        else:
            args_match = self._matches_non_thiscall_parameters(ghidra_function)

        logging.debug(
            "Matches: namespace=%s name=%s return_type=%s thiscall=%s args=%s",
            namespace_match,
            name_match,
            return_type_match,
            thiscall_matches,
            args_match,
        )

        return all(
            (
                name_match,
                namespace_match,
                return_type_match,
                thiscall_matches,
                args_match,
            )
        )

    def _matches_non_thiscall_parameters(
        self, ghidra_function
    ):  # type: (Function) -> bool
        """Compare our arguments against all parameters of `ghidra_function`."""
        all_params = ghidra_function.getParameters()
        return self._parameter_lists_match(all_params)

    def _matches_thiscall_parameters(self, ghidra_function):  # type: (Function) -> bool
        """Compare our arguments against the parameters of `ghidra_function` minus the first."""
        params_without_this = ghidra_function.getParameters()  # type: list[Parameter]

        # remove the `this` argument which we don't generate ourselves
        params_without_this.pop(0)

        return self._parameter_lists_match(params_without_this)

    def _parameter_lists_match(self, ghidra_params):  # type: (list[Parameter]) -> bool
        """Return True if both lists have equal length and pairwise equal names and data types."""
        if len(self.arguments) != len(ghidra_params):
            return False

        return not any(
            mine.getName() != theirs.getName()
            or mine.getDataType() != theirs.getDataType()
            for mine, theirs in zip(self.arguments, ghidra_params)
        )

    def overwrite_ghidra_function(self, ghidra_function):  # type: (Function) -> None
        """Replace the function declaration in Ghidra by the one derived from C++."""
        user_defined = SourceType.USER_DEFINED

        ghidra_function.setName(self.name, user_defined)
        ghidra_function.setParentNamespace(self.namespace)
        ghidra_function.setReturnType(self.return_type, user_defined)

        # not sure what calling convention to choose when it's not a __thiscall,
        # so we play it safe and keep whatever Ghidra has
        if self.class_name:
            ghidra_function.setCallingConvention("__thiscall")

        ghidra_function.replaceParameters(
            Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,
            True,
            user_defined,
            self.arguments,
        )