Files
kaizen/external/capstone/suite/auto-sync/src/autosync/MCUpdater.py
T
2026-03-23 12:11:07 +01:00

642 lines
24 KiB
Python

#!/usr/bin/env python3
# Copyright © 2024 Rot127 <unisono@quyllur.org>
# SPDX-License-Identifier: BSD-3
import argparse
import logging as log
import json
import re
import sys
import subprocess as sp
from pathlib import Path
from autosync.Targets import TARGETS_LLVM_NAMING, TARGET_TO_DIR_NAME
from autosync.Helper import convert_loglevel, get_path
class LLVM_MC_Command:
def __init__(self, cmd_line: str, mattr: str):
self.cmd: str = ""
self.opts: str = ""
self.file: Path | None = None
self.additional_mattr: str = mattr
self.cmd, self.opts, self.file = self.parse_llvm_mc_line(cmd_line)
if not (self.cmd and self.opts and self.file):
log.warning(f"Could not parse llvm-mc command: {cmd_line}")
elif not "--show-encoding" in self.cmd:
self.cmd = re.sub("llvm-mc", "llvm-mc --show-encoding", self.cmd)
elif not "--disassemble" in self.cmd:
self.cmd = re.sub("llvm-mc", "llvm-mc --disassemble", self.cmd)
def parse_llvm_mc_line(self, line: str) -> tuple[str, str, Path]:
test_file_base_dir = str(get_path("{LLVM_LIT_TEST_DIR}").absolute())
file = re.findall(rf"{test_file_base_dir}\S+", line)
if not file:
log.warning(f"llvm-mc command doesn't contain a file: {line}")
return None, None, None
test_file = file[0]
cmd = re.sub(rf"{test_file}", "", line).strip()
cmd = re.sub(r"\s+", " ", cmd)
arch = re.finditer(r"(triple|arch)[=\s](\S+)", cmd)
mattr = re.finditer(r"(mattr|mcpu)[=\s](\S+)", cmd)
opts = ",".join([m.group(2) for m in arch]) if arch else ""
if mattr:
opts += "" if not opts else ","
processed_attr = list()
for m in mattr:
attribute = m.group(2).strip("+")
processed_attr.append(attribute)
opts += ",".join(processed_attr)
return cmd, opts, Path(test_file)
def exec(self) -> sp.CompletedProcess:
with open(self.file, "b+r") as f:
content = f.read()
if self.additional_mattr:
# If mattr exists, patch it into the cmd
if "mattr" in self.cmd:
self.cmd = re.sub(
r"mattr[=\s]+", f"mattr={self.additional_mattr} -mattr=", self.cmd
)
else:
self.cmd = re.sub(
r"llvm-mc", f"llvm-mc -mattr={self.additional_mattr}", self.cmd
)
log.debug(f"Run: {self.cmd}")
result = sp.run(self.cmd.split(" "), input=content, capture_output=True)
return result
def get_opts_list(self) -> list[str]:
opts = self.opts.strip().strip(",")
opts = re.sub(r"[, ]+", ",", opts)
return opts.split(",")
def __str__(self) -> str:
return f"{self.cmd} < {str(self.file.absolute())}"
class MCTest:
    """
    A single test. It can contain multiple decoded instruction for a given byte sequence.
    In general a MCTest always tests a sequence of instructions in a single .text segment.
    """

    def __init__(self, arch: str, opts: list[str], encoding: str, asm_text: str):
        """
        :param arch: The architecture name. With or without the CS_ARCH_ prefix.
        :param opts: The options of the test case.
        :param encoding: The encoding bytes of the first instruction.
        :param asm_text: The asm text of the first instruction.
        """
        self.arch = arch
        self.opts = opts
        self.encoding: list[str] = [encoding]
        self.asm_text: list[str] = [asm_text]

    def extend(self, encoding: str, asm_text: str):
        """Adds another instruction (encoding and asm text) to this test."""
        self.encoding.append(encoding)
        self.asm_text.append(asm_text)

    def _cs_arch_name(self) -> str:
        """Returns the arch name with exactly one CS_ARCH_ prefix."""
        if self.arch.startswith("CS_ARCH"):
            return self.arch
        return f"CS_ARCH_{self.arch.upper()}"

    def get_legacy_mc_test_triple(self):
        """
        Returns the legacy triple for the old MC test files:
        <ARCH>, <MODE>, None
        Should only be used to generate fuzzing tests.
        """
        modes = "|".join([f'"{o}"' for o in self.opts if o.startswith("CS_MODE_")])
        if not modes:
            modes = "0"
        return f"# {self._cs_arch_name()}, {modes}, None"

    def fuzz_test_str(self):
        """
        Returns one line per instruction of the form:
        <comma separated bytes> == <asm text>
        """
        lines = []
        for enc, asm_text in zip(self.encoding, self.asm_text):
            encoding = re.sub(r"[\[\]]", "", enc).strip()
            encoding = re.sub(r"[\s,]+", ",", encoding)
            lines.append(f"{encoding} == {asm_text}")
        return "\n".join(lines)

    def __str__(self):
        """
        Returns the test as YAML list item of `test_cases`.
        The bytes of all instructions are merged into a single byte list.
        """
        encoding = ",".join(self.encoding)
        encoding = re.sub(r"[\[\]]", "", encoding)
        encoding = encoding.strip()
        encoding = re.sub(r"[\s,]+", ", ", encoding)

        # The indentation encodes the YAML nesting:
        # list item -> input/expected -> their keys -> insns items.
        yaml_tc = (
            "  -\n"
            "    input:\n"
            "      bytes: [ <ENCODING> ]\n"
            '      arch: "<ARCH>"\n'
            "      options: [ <OPTIONS> ]\n"
            "    expected:\n"
            "      insns:\n"
        )
        template = "        -\n          asm_text: <ASM_TEXT>\n"
        insn_cases = "".join(
            template.replace("<ASM_TEXT>", f'"{text}"') for text in self.asm_text
        )

        yaml_tc = yaml_tc.replace("<ENCODING>", encoding)
        yaml_tc = yaml_tc.replace("<ARCH>", self._cs_arch_name())
        yaml_tc = yaml_tc.replace("<OPTIONS>", ", ".join([f'"{o}"' for o in self.opts]))
        return yaml_tc + insn_cases
class TestFile:
def __init__(
self,
arch: str,
file_path: Path,
opts: list[str] | None,
mc_cmd: LLVM_MC_Command,
unified_test_cases: bool,
):
self.arch: str = arch
self.file_path: Path = file_path
self.opts: list[str] = list() if not opts else opts
self.mc_cmd: LLVM_MC_Command = mc_cmd
# Indexed by .text section count
self.tests: dict[int : list[MCTest]] = dict()
self.init_tests(unified_test_cases)
def init_tests(self, unified_test_cases: bool):
mc_output = self.mc_cmd.exec()
if mc_output.stderr and not mc_output.stdout:
# We can still continue. We just ignore the failed cases.
log.debug(f"llvm-mc cmd stderr: {mc_output.stderr}")
log.debug(f"llvm-mc result: {mc_output}")
text_section = 0 # Counts the .text sections
asm_pat = f"(?P<asm_text>.+)"
enc_pat = r"(\[?(?P<full_enc_string>(?P<enc_bytes>((0x[a-fA-F0-9]{1,2}[, ]{0,2}))+)[^, ]?)\]?)"
dups = []
for line in mc_output.stdout.splitlines():
line = line.decode("utf8")
if ".text" in line:
text_section += 1
continue
match = re.search(
rf"^\s*{asm_pat}\s*(#|//|@|!|;)\s*encoding:\s*{enc_pat}", line
)
if not match:
continue
full_enc_string = match.group("full_enc_string")
if not re.search(r"0x[a-fA-F0-9]{1,2}$", full_enc_string[:-1]):
log.debug(f"Ignore because symbol injection is needed: {line}")
# The encoding string contains symbol information of the form:
# [0xc0,0xe0,A,A,A... or similar. We ignore these for now.
continue
enc_bytes = match.group("enc_bytes").strip()
asm_text = match.group("asm_text").strip()
asm_text = re.sub(r"\t+", " ", asm_text)
asm_text = asm_text.strip()
if not self.valid_byte_seq(enc_bytes):
continue
if (enc_bytes + asm_text) in dups:
continue
dups.append(enc_bytes + asm_text)
if text_section in self.tests:
if unified_test_cases:
self.tests[text_section][0].extend(enc_bytes, asm_text)
else:
self.tests[text_section].append(
MCTest(self.arch, self.opts, enc_bytes, asm_text)
)
else:
self.tests[text_section] = [
MCTest(self.arch, self.opts, enc_bytes, asm_text)
]
def has_tests(self) -> bool:
return len(self.tests) != 0
def get_cs_testfile_content(self, only_tests: bool) -> str:
content = "\n" if only_tests else "test_cases:\n"
for tl in self.tests.values():
content += "\n".join([str(t) for t in tl])
return content
def get_fuzz_test_file_content(self, only_tests: bool) -> str:
content = ""
for tl in self.tests.values():
if not content:
content = (
"\n" if only_tests else tl[0].get_legacy_mc_test_triple() + "\n"
)
content += "\n".join([t.fuzz_test_str() for t in tl])
return content
def num_test_cases(self) -> int:
return len(self.tests)
def valid_byte_seq(self, enc_bytes):
match self.arch:
case "AArch64":
# It always needs 4 bytes.
# Otherwise it is likely a reloc or symbol test
return enc_bytes.count("0x") == 4
case _:
return True
def get_multi_mode_filename(self) -> Path:
filename = self.file_path.stem
parent = self.file_path.parent
prefix_less_opts = [re.sub(r"CS_(OPT|MODE)_", "", o).lower() for o in self.opts]
detailed_name = f"{filename}_{'_'.join(prefix_less_opts)}.txt"
detailed_name = re.sub(r"[+-]", "_", detailed_name)
out_path = parent.joinpath(detailed_name)
return Path(out_path)
def get_simple_filename(self) -> Path:
return self.file_path
def __lt__(self, other) -> bool:
return str(self.file_path) < str(other.file_path)
def exists_and_is_dir(x):
    """Returns True if the path `x` exists and is a directory."""
    if not x.exists():
        return False
    return x.is_dir()
class MCUpdater:
"""
The MCUpdater parses all test files of the LLVM MC regression tests.
Each of those LLVM files can contain several llvm-mc commands to run on the same file.
Mostly this is done to test the same file with different CPU features enabled.
So it can test different flavors of assembly etc.
In Capstone all modules enable always all CPU features (even if this is not
possible in reality).
Due to this we always parse all llvm-mc commands run on a test file, generate a TestFile
object for each of it, but only write the last one of them to disk.
Once https://github.com/capstone-engine/capstone/issues/1992 is resolved, we can
write all variants of a test file to disk.
This is already implemented and tested with multi_mode = True.
"""
def __init__(
self,
arch: str,
mc_dir: Path,
excluded: list[str] | None,
included: list[str] | None,
unified_test_cases: bool,
multi_mode: bool = False,
):
self.symbolic_links = list()
self.arch = arch
self.arch_dir_name = TARGET_TO_DIR_NAME[self.arch]
self.test_dir_link_prefix = f"test_dir_{arch}_"
self.mc_dir = mc_dir
self.excluded = excluded if excluded else list()
self.included = included if included else list()
self.test_files: list[TestFile] = list()
self.unified_test_cases = unified_test_cases
with open(get_path("{MCUPDATER_CONFIG_FILE}")) as f:
self.conf = json.loads(f.read())
# Additional mattr passed to llvm-mc
self.mattr: str = (
",".join(self.conf["additional_mattr"][self.arch])
if self.arch in self.conf["additional_mattr"]
else ""
)
# A list of options which are always added.
self.mandatory_options: list[str] = (
self.conf["mandatory_options"][self.arch]
if self.arch in self.conf["mandatory_options"]
else list()
)
self.default_endianess: str = (
self.conf["default_endianess"][self.arch]
if self.arch in self.conf["default_endianess"]
else ""
)
self.remove_options: str = (
self.conf["remove_options"][self.arch]
if self.arch in self.conf["remove_options"]
else list()
)
self.remove_options = [x.lower() for x in self.remove_options]
self.replace_option_map: dict = (
self.conf["replace_option_map"][self.arch]
if self.arch in self.conf["replace_option_map"]
else {}
)
self.replace_option_map = {
k.lower(): v
for k, v in self.replace_option_map.items()
if k.lower not in self.remove_options
}
self.multi_mode = multi_mode
def check_prerequisites(self, paths):
if all(not exists_and_is_dir(path) for path in paths):
raise ValueError(
f"'{paths}' does not exits or is not a directory. Cannot generate tests from there."
)
llvm_lit_cfg = get_path("{LLVM_LIT_TEST_DIR}")
if not llvm_lit_cfg.exists():
raise ValueError(
f"Could not find '{llvm_lit_cfg}'. Check {{LLVM_LIT_TEST_DIR}} in path_vars.json."
)
def write_to_build_dir(self, fuzzer_tests: bool = False):
no_tests_file = 0
file_cnt = 0
test_cnt = 0
overwritten = 0
files_written = set()
for test in sorted(self.test_files):
if not test.has_tests():
no_tests_file += 1
continue
file_cnt += 1
test_cnt += test.num_test_cases()
if self.multi_mode:
rel_path = str(
test.get_multi_mode_filename().relative_to(
get_path("{LLVM_LIT_TEST_DIR}")
)
)
else:
rel_path = str(
test.get_simple_filename().relative_to(
get_path("{LLVM_LIT_TEST_DIR}")
)
)
filename = re.sub(rf"{self.test_dir_link_prefix}\d+", ".", rel_path)
if fuzzer_tests:
filename = get_path("{MCUPDATER_OUT_FUZZ_DIR}").joinpath(
f"{filename}.cs"
)
else:
filename = get_path("{MCUPDATER_OUT_DIR}").joinpath(f"{filename}.yaml")
if filename in files_written:
write_mode = "a"
else:
write_mode = "w+"
filename.parent.mkdir(parents=True, exist_ok=True)
if self.multi_mode and filename.exists():
log.warning(
f"The following file exists already: {filename}. This indicates a blind spot in testing."
)
overwritten += 1
elif not self.multi_mode and filename.exists():
log.debug(f"Overwrite: {filename}")
overwritten += 1
with open(filename, write_mode) as f:
if fuzzer_tests:
content = test.get_fuzz_test_file_content(
only_tests=(write_mode == "a")
)
else:
content = test.get_cs_testfile_content(
only_tests=(write_mode == "a")
)
f.write(content)
log.debug(f"Write {filename}")
files_written.add(filename)
print()
log.info(
f"Got {len(self.test_files)} {'fuzzing ' if fuzzer_tests else ''}test files.\n"
f"\t\tProcessed {file_cnt} files with {test_cnt} test cases.\n"
f"\t\tIgnored {no_tests_file} without tests.\n"
f"\t\tGenerated {len(files_written)} files"
)
if overwritten > 0:
log.warning(
f"Overwrote {overwritten} test files with the same name.\n"
f"These files contain instructions of several different cpu features.\n"
f"You have to use multi-mode to write them into distinct files.\n"
f"The current setting will only keep the last one written.\n"
f"See also: https://github.com/capstone-engine/capstone/issues/1992\n"
"If you already used multi-mode (default = yes), there might be a blind spot in testing."
)
def build_test_options(self, options):
new_options = [] + self.mandatory_options
for opt in options:
opt = opt.lower()
if opt in self.remove_options:
continue
elif opt in self.replace_option_map:
new_options.extend(self.replace_option_map[opt])
else:
new_options.append(opt)
if (
not any(
[
True
for x in new_options
if x in ["CS_MODE_BIG_ENDIAN", "CS_MODE_LITTLE_ENDIAN"]
]
)
and self.default_endianess
):
new_options.append(self.default_endianess)
return new_options
def build_test_files(self, mc_cmds: list[LLVM_MC_Command]) -> list[TestFile]:
log.info("Build TestFile objects")
test_files = list()
n_all = len(mc_cmds)
for i, mcc in enumerate(mc_cmds):
print(f"{i + 1}/{n_all} {mcc.file.name}", flush=True, end="\r")
opts = self.build_test_options(mcc.get_opts_list())
test_files.append(
TestFile(
self.arch,
mcc.file,
opts,
mcc,
self.unified_test_cases,
)
)
return test_files
def run_llvm_lit(self, paths: list[Path]) -> list[LLVM_MC_Command]:
"""
Calls llvm-lit with the given paths to the tests.
It parses the llvm-lit commands to LLVM_MC_Commands.
"""
lit_cfg_dir = get_path("{LLVM_LIT_TEST_DIR}")
llvm_lit_cfg = str(lit_cfg_dir.absolute())
args = ["lit", "-v", "-a", llvm_lit_cfg]
for i, p in enumerate(paths):
slink = lit_cfg_dir.joinpath(f"{self.test_dir_link_prefix}{i}")
self.symbolic_links.append(slink)
log.debug(f"Create link: {slink} -> {p}")
try:
slink.symlink_to(p, target_is_directory=True)
except FileExistsError as e:
print("Failed: Link existed. Please delete it")
raise e
log.info(f"Run lit: {' '.join(args)}")
cmds = sp.run(args, capture_output=True)
if cmds.stderr:
raise ValueError(f"llvm-lit failed with {cmds.stderr}")
return self.extract_llvm_mc_cmds(cmds.stdout.decode("utf8"))
def extract_llvm_mc_cmds(self, cmds: str) -> list[LLVM_MC_Command]:
log.debug("Parsing llvm-mc commands")
# Get only the RUN lines which have a show-encoding set.
cmd_lines = cmds.splitlines()
log.debug(f"NO FILTER: {cmd_lines}")
matches = list(
filter(
lambda l: (
l
if re.search(r"^RUN.+(show-encoding|disassemble)[^|]+", l)
else None
),
cmd_lines,
)
)
log.debug(f"FILTER RUN: {' '.join(matches)}")
# Don't add tests which are allowed to fail
matches = list(
filter(lambda m: None if re.search(r"not\s+llvm-mc", m) else m, matches)
)
log.debug(f"FILTER not llvm-mc: {' '.join(matches)}")
# Skip object file tests
matches = list(
filter(lambda m: None if re.search(r"filetype=obj", m) else m, matches)
)
log.debug(f"FILTER filetype=obj-mc: {' '.join(matches)}")
# Skip any relocation related tests.
matches = filter(lambda m: None if re.search(r"reloc", m) else m, matches)
# Remove 'RUN: at ...' prefix
matches = map(lambda m: re.sub(r"^RUN: at line \d+: ", "", m), matches)
# Remove redirection
matches = map(lambda m: re.sub(r"\d>&\d", "", m), matches)
# Remove unused arguments
matches = map(lambda m: re.sub(r"-o\s?-", "", m), matches)
# Remove redirection of stderr to a file
matches = map(lambda m: re.sub(r"2>\s?\S+", "", m), matches)
# Remove piping to FileCheck
matches = map(lambda m: re.sub(r"\|\s*FileCheck\s+.+", "", m), matches)
# Remove input stream
matches = map(lambda m: re.sub(r"\s+<", "", m), matches)
all_cmds = list()
for match in matches:
if self.included and not any(
re.search(x, match) is not None for x in self.included
):
continue
if any(re.search(x, match) is not None for x in self.excluded):
continue
llvm_mc_cmd = LLVM_MC_Command(match, self.mattr)
if not llvm_mc_cmd.cmd:
# Invalid
continue
all_cmds.append(llvm_mc_cmd)
log.debug(f"Added: {llvm_mc_cmd}")
log.debug(f"Extracted {len(all_cmds)} llvm-mc commands")
return all_cmds
def gen_all(self):
log.info("Check prerequisites")
test_paths = list()
if self.arch in self.conf["use_assembly_tests"]:
log.info(f"Add assembly tests for {self.arch}")
test_paths.append(self.mc_dir.joinpath(self.arch))
if self.arch not in self.conf["exclude_disassembly_tests"]:
log.info(f"Add disassembly tests for {self.arch}")
disas_tests = self.mc_dir.joinpath(f"Disassembler/{self.arch_dir_name}")
test_paths.append(disas_tests)
self.check_prerequisites(test_paths)
log.info("Generate MC regression tests")
llvm_mc_cmds = self.run_llvm_lit(
[path for path in test_paths if exists_and_is_dir(path)]
)
log.info(f"Got {len(llvm_mc_cmds)} llvm-mc commands to run")
self.test_files = self.build_test_files(llvm_mc_cmds)
for slink in self.symbolic_links:
log.debug(f"Unlink {slink}")
slink.unlink()
def parse_args() -> argparse.Namespace:
    """Defines the command line interface and returns the parsed arguments."""
    cli = argparse.ArgumentParser(
        prog="Test file updater",
        description="Synchronizes test files with LLVM",
    )
    # One entry per option: the flag and the keyword arguments of add_argument().
    option_specs = [
        (
            "-d",
            {
                "dest": "mc_dir",
                "help": f"Path to the LLVM MC test files. Default: {get_path('{LLVM_MC_TEST_DIR}')}",
                "default": get_path("{LLVM_MC_TEST_DIR}"),
                "type": Path,
            },
        ),
        (
            "-a",
            {
                "dest": "arch",
                "help": "Name of architecture to update.",
                "choices": TARGETS_LLVM_NAMING,
                "required": True,
            },
        ),
        (
            "-e",
            {
                "dest": "excluded_files",
                "metavar": "filename",
                "nargs": "+",
                "help": "File names to exclude from update (can be a regex pattern).",
            },
        ),
        (
            "-i",
            {
                "dest": "included_files",
                "metavar": "filename",
                "nargs": "+",
                "help": "Specific list of file names to update (can be a regex pattern).",
            },
        ),
        (
            "-u",
            {
                "dest": "unified_tests",
                "action": "store_true",
                "default": False,
                "help": "If set, all instructions of a text segment will decoded and tested at once. Should be set, if instructions depend on each other.",
            },
        ),
        (
            "-v",
            {
                "dest": "verbosity",
                "help": "Verbosity of the log messages.",
                "choices": ["debug", "info", "warning", "fatal"],
                "default": "info",
            },
        ),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
if __name__ == "__main__":
    args = parse_args()
    # Log to stdout. force=True replaces handlers which were set up before.
    log.basicConfig(
        level=convert_loglevel(args.verbosity),
        stream=sys.stdout,
        format="%(levelname)-5s - %(message)s",
        force=True,
    )
    updater = MCUpdater(
        arch=args.arch,
        mc_dir=args.mc_dir,
        excluded=args.excluded_files,
        included=args.included_files,
        unified_test_cases=args.unified_tests,
        multi_mode=True,
    )
    updater.gen_all()