import os import re import sys import argparse from typing import List, Iterator, TextIO from collections import namedtuple from enum import Enum class ReaderState(Enum): WANT_OFFSET = 0 WANT_SIG = 1 IN_FUNC = 2 CodeBlock = namedtuple('CodeBlock', ['offset', 'signature', 'start_line', 'end_line']) # To match a reasonable variance of formatting for the offset comment offsetCommentRegex = re.compile(r'//\s?OFFSET:\s?\w+ (?:0x)?([a-f0-9]+)', flags=re.I) # To match the exact syntax (text upper case, hex lower case, with spaces) # that is used in most places offsetCommentExactRegex = re.compile(r'// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)') def is_blank_or_comment(line: str) -> bool: """Helper to read ahead adter the offset comment is matched. There could be blank lines or other comments before the function signature, and we want to skip those.""" line_strip = line.strip() return (len(line_strip) == 0 or line_strip.startswith('//') or line_strip.startswith('/*') or line_strip.endswith('*/')) def is_exact_offset_comment(line: str) -> bool: """If the offset comment does not match our (unofficial) syntax we may want to alert the user to fix it for style points.""" return offsetCommentExactRegex.match(line) is not None def match_offset_comment(line: str) -> str | None: # TODO: intended to skip the expensive regex match, but is it necessary? if not line.startswith('//'): return None match = offsetCommentRegex.match(line) return match.group(1) if match is not None else None def find_code_blocks(stream: TextIO) -> List[CodeBlock]: """Read the IO stream (file) line-by-line and give the following report: Foreach code block (function) in the file, what are its starting and ending line numbers, and what is the given offset in the original binary. We expect the result to be ordered by line number because we are reading the file from start to finish.""" blocks = [] offset = None function_sig = None start_line = None state = ReaderState.WANT_OFFSET for line_no, line in enumerate(stream): if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC): # Naive but reasonable assumption that functions will end with # a curly brace on its own line with no prepended spaces. if line.startswith('}'): # TODO: could streamline this and the next case block = CodeBlock(offset=offset, signature=function_sig, start_line=start_line, end_line=line_no) blocks.append(block) state = ReaderState.WANT_OFFSET elif match_offset_comment(line) is not None: # We hit another offset unexpectedly before detecting the # end of the function. We can recover easily by just # ending the function here. block = CodeBlock(offset=offset, signature=function_sig, start_line=start_line, end_line=line_no - 1) blocks.append(block) state = ReaderState.WANT_OFFSET # We want to grab the function signature so we can identify # the code block. Skip any blank lines or comments # that follow the offset comment. elif (not is_blank_or_comment(line) and state == ReaderState.WANT_SIG): function_sig = line.strip() state = ReaderState.IN_FUNC if state == ReaderState.WANT_OFFSET: match = match_offset_comment(line) if match is not None: offset = int(match, 16) start_line = line_no state = ReaderState.WANT_SIG return blocks def file_is_cpp(filename: str) -> bool: # TODO: expand to check header files also? (basefile, ext) = os.path.splitext(filename) return ext.lower() == '.cpp' def walk_source_dir(source: str) -> Iterator[tuple]: """Generator to walk the given directory recursively and return any .cpp files found.""" for subdir, dirs, files in os.walk(source): for file in files: if not file_is_cpp(file): continue yield os.path.join(subdir, file) def sig_truncate(sig: str) -> str: """Helper to truncate function names to 50 chars and append ellipsis if needed. Goal is to stay under 80 columns for tool output.""" return f"{sig[:47]}{'...' if len(sig) >= 50 else ''}" def get_inexact_offset_comments(stream: TextIO) -> [tuple]: """Read the file stream and return the line number and string for any offset comments that don't exactly match the template.""" return ([ (line_no, line.strip()) for line_no, line in enumerate(stream) if match_offset_comment(line) and not is_exact_offset_comment(line) ]) def check_file(filename: str, verbose: bool = False) -> bool: """Open and read the given file, then check whether the code blocks are in order. If verbose, print each block.""" with open(filename, 'r') as f: code_blocks = find_code_blocks(f) # TODO: Should combine these checks if/when we refactor. # This is just for simplicity / proof of concept. f.seek(os.SEEK_SET, 0) bad_comments = get_inexact_offset_comments(f) just_offsets = [block.offset for block in code_blocks] sorted_offsets = sorted(just_offsets) file_out_of_order = just_offsets != sorted_offsets # If we detect inexact comments, don't print anything unless we are # in verbose mode. If the file is out of order, we always print the # file name. should_report = ((len(bad_comments) > 0 and verbose) or file_out_of_order) if not should_report and not file_out_of_order: return False # Else: we are alerting to some problem in this file print(filename) if verbose: if file_out_of_order: order_lookup = {k: i for i, k in enumerate(sorted_offsets)} prev_offset = 0 for block in code_blocks: msg = ' '.join([ ' ' if block.offset > prev_offset else '!', f'{block.offset:08x}', f'{block.end_line - block.start_line:4} lines', f'{order_lookup[block.offset]:3}', ' ', sig_truncate(block.signature), ]) print(msg) prev_offset = block.offset for (line_no, line) in bad_comments: print(f'* line {line_no:3} bad offset comment ({line})') print() return file_out_of_order def parse_args(test_args: list | None = None) -> dict: p = argparse.ArgumentParser() p.add_argument('target', help='The file or directory to check.') p.add_argument('--enforce', action=argparse.BooleanOptionalAction, default=False, help='Fail with error code if target is out of order.') p.add_argument('--verbose', action=argparse.BooleanOptionalAction, default=False, help=('Display each code block in the file and show ' 'where each consecutive run of blocks is broken.')) if test_args is None: args = p.parse_args() else: args = p.parse_args(test_args) return vars(args) def main(): args = parse_args() if os.path.isdir(args['target']): files_to_check = list(walk_source_dir(args['target'])) elif os.path.isfile(args['target']) and file_is_cpp(args['target']): files_to_check = [args['target']] else: sys.exit('Invalid target') files_out_of_order = 0 for file in files_to_check: is_jumbled = check_file(file, args['verbose']) if is_jumbled: files_out_of_order += 1 if files_out_of_order > 0: error_message = ' '.join([ str(files_out_of_order), 'files are' if files_out_of_order > 1 else 'file is', 'out of order' ]) print(error_message) if files_out_of_order > 0 and args['enforce']: sys.exit(1) if __name__ == '__main__': main()