diff --git a/build.py b/build.py index 64e7d4f..7510c96 100644 --- a/build.py +++ b/build.py @@ -11,7 +11,10 @@ from jinja2 import Environment, select_autoescape, FileSystemLoader from pipeline.translator import PythonBuilder -from pipeline.utils import clone_sources, SchemaLoader, InstanceLoader +from pipeline.utils import ( + clone_sources, SchemaLoader, InstanceLoader, + build_instance_id_to_ref, generate_directory_instance_files, +) include_instances = True # to speed up the build during development, set this to False @@ -52,6 +55,9 @@ print(f"Loaded instances ({perf_counter() - start_time} s)") python_modules = defaultdict(list) +directories_with_patches = set() # (version_module, dir_path) pairs needing _instance_patches.py + +env = Environment(loader=FileSystemLoader(os.path.dirname(os.path.realpath(__file__))), autoescape=select_autoescape()) for schema_version in schema_loader.get_schema_versions(): @@ -62,11 +68,12 @@ embedded = set() linked = set() class_to_module_map = {} + class_full_modules = {} for schema_file_path in schemas_file_paths: emb, lnk = PythonBuilder(schema_file_path, schema_loader.schemas_sources).get_edges() class_to_module_map = PythonBuilder( schema_file_path, schema_loader.schemas_sources - ).update_class_to_module_map(class_to_module_map) + ).update_class_to_module_map(class_to_module_map, class_full_modules) embedded.update(emb) linked.update(lnk) conflicts = linked.intersection(embedded) @@ -79,18 +86,43 @@ linked.remove(schema_identifier) # Step 4b - translate and build each openMINDS schema as a Python class + instance_data_by_dir = defaultdict(list) # dir_path → [(class_name, full_module_path, instances_raw)] + for schema_file_path in schemas_file_paths: - module_path, class_name = PythonBuilder( + builder = PythonBuilder( schema_file_path, schema_loader.schemas_sources, instances=instances.get(schema_version, None), additional_methods=additional_methods, - ).build(embedded=embedded, class_to_module_map=class_to_module_map) + ) + module_path, class_name = builder.build( + embedded=embedded, class_to_module_map=class_to_module_map, class_full_modules=class_full_modules + ) + + if builder.context["instances_raw"]: + dir_parts = builder.relative_path_without_extension[:-1] + dir_path = "/".join(dir_parts) + instance_data_by_dir[dir_path].append( + (class_name, class_full_modules[class_name], builder.context["instances_raw"]) + ) parts = module_path.split(".") parent_path = ".".join(parts[:-1]) python_modules[parent_path].append((parts[-1], class_name)) + # Step 4c - generate _instances.py for each leaf directory that has instances + version_module = schema_version.split(".")[0] + all_instances_for_version = instances.get(schema_version, {}) + id_to_ref = build_instance_id_to_ref(all_instances_for_version) + + for dir_path, dir_class_data in instance_data_by_dir.items(): + has_patches = generate_directory_instance_files( + version_module, dir_path, dir_class_data, + all_instances_for_version, id_to_ref, class_full_modules, env + ) + if has_patches: + directories_with_patches.add((version_module, dir_path)) + print(f"Processed schemas ({perf_counter() - start_time} s)") @@ -104,6 +136,9 @@ with open(init_file_path, "w") as fp: for class_module, class_name in sorted(classes): fp.write(f"from .{class_module} import {class_name}\n") + rel_dir_path = "/".join(dir_path[3:]) + if (dir_path[2], rel_dir_path) in directories_with_patches: + fp.write("from . import _instance_patches as _ # noqa: F401\n") while len(dir_path) > 3: child_dir = dir_path[-1] dir_path = dir_path[:-1] @@ -118,7 +153,6 @@ with open(init_file_path, "w") as fp: fp.write(f"from . import ({', '.join(sorted(module_list))})\n") -env = Environment(loader=FileSystemLoader(os.path.dirname(os.path.realpath(__file__))), autoescape=select_autoescape()) context = { "version": "0.5.1", } diff --git a/pipeline/src/instance_patches_template.py.txt b/pipeline/src/instance_patches_template.py.txt new file mode 100644 index 0000000..836c847 --- /dev/null +++ b/pipeline/src/instance_patches_template.py.txt @@ -0,0 +1,9 @@ +# this file was auto-generated! + +{% for imp in imports %} +{{imp}} +{% endfor %} + +{% for class_name, inst_name, prop_name, value in patches %} +{{class_name}}.{{inst_name}}.{{prop_name}} = {{value}} +{% endfor %} diff --git a/pipeline/src/instances_template.py.txt b/pipeline/src/instances_template.py.txt new file mode 100644 index 0000000..e0aa796 --- /dev/null +++ b/pipeline/src/instances_template.py.txt @@ -0,0 +1,24 @@ +# this file was auto-generated! + +{% for imp in imports %} +{{imp}} +{% endfor %} + +{% for class_name, inst_name, props in phase1_instances %} +{{class_name}}.{{inst_name}} = {{class_name}}( + {% for key, value in props.items() -%} + {% if value is string -%} + {% if value.startswith('http') and key != 'id' -%} + {{key}}=IRI("{{value}}"), + {%- else -%} + {{key}}="{{value}}", + {%- endif %} + {%- else -%} + {{key}}={{value}}, + {%- endif %} + {% endfor -%} +) +{% endfor %} +{% for class_name, inst_name, prop_name, value in phase2_patches %} +{{class_name}}.{{inst_name}}.{{prop_name}} = {{value}} +{% endfor %} diff --git a/pipeline/src/module_template.py.txt b/pipeline/src/module_template.py.txt index 2be2a9f..f565a15 100644 --- a/pipeline/src/module_template.py.txt +++ b/pipeline/src/module_template.py.txt @@ -48,17 +48,5 @@ class {{ class_name }}({{ base_class }}): {{ additional_methods }} -{% for instance_name, instance in instances.items() %} -{{ class_name }}.{{ instance_name }} = {{ class_name }}( - {% for key, value in instance.items() -%} - {% if value is string -%} - {% if value.startswith('http') and key != 'id' -%} - {{key}}=IRI("{{value}}"), - {%- else -%} - {{key}}="{{value}}", - {%- endif %} - {%- else -%} - {{key}}={{value}}, - {%- endif %} - {% endfor -%} -){% endfor %} + +{{instances_trigger}} diff --git a/pipeline/tests/test_regressions.py b/pipeline/tests/test_regressions.py index d1e6d16..4f35986 100644 --- a/pipeline/tests/test_regressions.py +++ b/pipeline/tests/test_regressions.py @@ -583,3 +583,29 @@ def test_issue0084(om): "name": "test", "order": 0, } + + +@pytest.mark.parametrize("om", [openminds.v5, openminds.latest]) +def test_issue0094(om, tmp_path): + # https://github.com/openMetadataInitiative/openMINDS_Python/issues/94 + # Accessibility library instances store payment_models as dicts instead of + # PaymentModelType objects, causing KeyError on Collection.load() + + PaymentModelType = om.controlled_terms.PaymentModelType + + acc = om.core.Accessibility.direct_virtual_open_access + + # Properties should be typed objects, not dicts + assert not isinstance(acc.payment_models[0], dict) + assert isinstance(acc.payment_models[0], PaymentModelType) + + # Save and reload should not raise KeyError + c = Collection() + c.add(acc) + c.save(str(tmp_path), individual_files=True, group_by_schema=True) + + c2 = Collection() + c2.load(str(tmp_path), version=om.__name__.split(".")[1]) + + acc2 = next(item for item in c2 if isinstance(item, om.core.Accessibility)) + assert acc2.id == acc.id diff --git a/pipeline/translator.py b/pipeline/translator.py index 5ffc828..d283024 100644 --- a/pipeline/translator.py +++ b/pipeline/translator.py @@ -100,7 +100,7 @@ def _version_module(self): def _target_file_without_extension(self) -> str: return os.path.join(self._version_module, "/".join(self.relative_path_without_extension)) - def translate(self, embedded=None, class_to_module_map=None): + def translate(self, embedded=None, class_to_module_map=None, class_full_modules=None): def get_type(property): type_map = { "string": "str", @@ -164,23 +164,31 @@ def get_type(property): else: base_class = "LinkedMetadata" - def filter_value(value): + has_instances = bool(self.instances.get(openminds_type)) + module_name = self.relative_path_without_extension[-1] + + def filter_value_strings(value): + """Normalize strings only — PythonRef conversion happens in *_instances.py generation.""" if isinstance(value, str): return value.replace('"', "'").replace("\n", " ") + if isinstance(value, list): + return [filter_value_strings(item) for item in value] return value def filter_instance(instance): - filtered_instance = { - k: filter_value(v) for k, v in instance.items() if k[0] != "@" and k[:4] != "http" and v is not None + filtered = { + k: filter_value_strings(v) + for k, v in instance.items() + if k[0] != "@" and k[:4] != "http" and v is not None } - filtered_instance["id"] = instance["@id"] - return filtered_instance + filtered["id"] = instance["@id"] + return filtered - instances = { - generate_python_name(instance["@id"].split("/")[-1]): filter_instance(instance) - for instance in self.instances.get(openminds_type, []) + instances_raw = { + generate_python_name(inst["@id"].split("/")[-1]): filter_instance(inst) + for inst in self.instances.get(openminds_type, []) } - instances = {name: instances[name] for name in sorted(instances)} # sort by key + instances_raw = {name: instances_raw[name] for name in sorted(instances_raw)} properties = [] for iri, property in self._schema_payload["properties"].items(): @@ -211,9 +219,10 @@ def filter_instance(instance): } ) # unused in property: "nameForReverseLink" - for instance in instances.values(): + for instance in instances_raw.values(): if property["name"] in instance: instance[pythonic_name] = instance.pop(property["name"]) + self.context = { "docstring": self._schema_payload.get("description", ""), "base_class": base_class, @@ -223,13 +232,13 @@ def filter_instance(instance): "schema_version": self.version, "context_vocab": self.context_vocab, "properties": sorted(properties, key=lambda p: p["name"].lower()), - "additional_methods": "", - "instances": instances, + "additional_methods": self.additional_methods["by_name"] if has_instances else "", + "instances_trigger": ( + f"from . import {module_name}_instances as _ # noqa: F401" if has_instances else "" + ), + "instances_raw": instances_raw, } - if len(instances) > 0: - self.context["additional_methods"] = self.additional_methods["by_name"] - import_map = { "date": "from datetime import date", "datetime": "from datetime import datetime", @@ -239,8 +248,6 @@ def filter_instance(instance): "Real": "from numbers import Real", } extra_imports = set() - if len(instances) > 0: - extra_imports.add(import_map["IRI"]) for property in self.context["properties"]: if isinstance(property["type"], list): for t in property["type"]: @@ -251,14 +258,14 @@ def filter_instance(instance): imp = import_map.get(property["type"], None) if imp: extra_imports.add(imp) - if extra_imports: - self.context["preamble"] = "\n".join(sorted(extra_imports)) + if extra_imports: + self.context["preamble"] = "\n".join(sorted(extra_imports)) - def build(self, embedded=None, class_to_module_map=None): + def build(self, embedded=None, class_to_module_map=None, class_full_modules=None): target_file_path = os.path.join("target", "openminds", f"{self._target_file_without_extension()}.py") os.makedirs(os.path.dirname(target_file_path), exist_ok=True) - self.translate(embedded=embedded, class_to_module_map=class_to_module_map) + self.translate(embedded=embedded, class_to_module_map=class_to_module_map, class_full_modules=class_full_modules) with open(target_file_path, "w") as target_file: contents = self.env.get_template(self.template_name).render(self.context) @@ -274,7 +281,7 @@ def get_edges(self): linked.update(property.get("_linkedTypes", [])) return embedded, linked - def update_class_to_module_map(self, class_to_module_map): + def update_class_to_module_map(self, class_to_module_map, class_full_modules=None): """ Updates a dictionary with the class name and its corresponding module based on the schemas. @@ -288,6 +295,9 @@ def update_class_to_module_map(self, class_to_module_map): Args: class_to_module_map (dict): A dictionary where keys are class names and values are their corresponding modules. + class_full_modules (dict, optional): If provided, also populated with + class_name → full dotted submodule path + (e.g. "core.miscellaneous.accessibility"). Returns: dict: The updated dictionary with the class name and module mapping. @@ -301,4 +311,7 @@ def update_class_to_module_map(self, class_to_module_map): class_to_module_map[class_name] = module + if class_full_modules is not None: + class_full_modules[class_name] = ".".join(self.relative_path_without_extension) + return class_to_module_map diff --git a/pipeline/utils.py b/pipeline/utils.py index 45ac484..fc4e3bb 100644 --- a/pipeline/utils.py +++ b/pipeline/utils.py @@ -5,6 +5,257 @@ from git import Repo, GitCommandError +from pipeline.translator import generate_python_name + + +class PythonRef: + """A bare Python identifier/expression rendered without quotes in generated code.""" + + def __init__(self, ref): + self.ref = ref + + def __repr__(self): + return self.ref + + def __str__(self): + return self.ref + + +def build_instance_id_to_ref(instances_by_type): + """Build a map from instance @id → (class_name, python_attr_name) for all instances.""" + id_to_ref = {} + for type_iri, type_instances in instances_by_type.items(): + ref_class = type_iri.split("/")[-1] + for inst in type_instances: + ref_name = generate_python_name(inst["@id"].split("/")[-1]) + id_to_ref[inst["@id"]] = (ref_class, ref_name) + return id_to_ref + + +def build_instance_ref_graph(instances_by_type, id_to_ref): + """Build a directed graph: class_name → set of class_names it references via instances.""" + graph = {} + + def collect(val, source_class): + if isinstance(val, dict) and set(val.keys()) == {"@id"}: + entry = id_to_ref.get(val["@id"]) + if entry and entry[0] != source_class: + graph.setdefault(source_class, set()).add(entry[0]) + elif isinstance(val, list): + for item in val: + collect(item, source_class) + + for type_iri, insts in instances_by_type.items(): + source_class = type_iri.split("/")[-1] + for inst in insts: + for val in inst.values(): + collect(val, source_class) + return graph + + +def _resolve_value(value, id_to_ref, referenced_classes): + """Convert {'@id': '...'} instance refs to PythonRef. Collect referenced class names.""" + if isinstance(value, dict) and set(value.keys()) == {"@id"}: + entry = id_to_ref.get(value["@id"]) + if entry: + ref_class, ref_name = entry + referenced_classes.add(ref_class) + return PythonRef(f"{ref_class}.{ref_name}") + elif isinstance(value, list): + return [_resolve_value(item, id_to_ref, referenced_classes) for item in value] + return value + + +def _has_cyclic_ref(value, deferred_classes): + """True if value contains a PythonRef whose class is in deferred_classes.""" + if isinstance(value, PythonRef): + return value.ref.split(".")[0] in deferred_classes + if isinstance(value, list): + return any(_has_cyclic_ref(item, deferred_classes) for item in value) + return False + + +def _instance_needs_iri(props): + """ + True if any property of props will be rendered as IRI(...) by the instances template, + i.e. is a string starting with 'http' in a property other than 'id'. + """ + return any( + isinstance(value, str) and value.startswith("http") for key, value in props.items() if key != "id" + ) + + +def _collect_python_ref_classes(value, classes): + """Collect the class name of every PythonRef found within value (recursing into lists).""" + if isinstance(value, PythonRef): + classes.add(value.ref.split(".")[0]) + elif isinstance(value, list): + for item in value: + _collect_python_ref_classes(item, classes) + + +def _sort_classes_by_creation_order(dependency_graph, classes): + """ + Sort classes so dependencies come before the classes that depend on them. + + Returns (order, cyclic_edges) where: + order — class names in creation order (dependencies first) + cyclic_edges — set of (referencing_class, referenced_class) pairs that form cycles; + referencing_class's refs to referenced_class must be deferred to phase 2 + """ + UNVISITED, VISITING, DONE = 0, 1, 2 + visit_state = {cls: UNVISITED for cls in classes} + order = [] + cyclic_edges = set() + + def visit(cls_name): + visit_state[cls_name] = VISITING + for neighbor in sorted(dependency_graph.get(cls_name, set())): + if neighbor not in classes: + continue + if visit_state[neighbor] == VISITING: + cyclic_edges.add((cls_name, neighbor)) + elif visit_state[neighbor] == UNVISITED: + visit(neighbor) + visit_state[cls_name] = DONE + order.append(cls_name) + + for cls in sorted(classes): + if visit_state[cls] == UNVISITED: + visit(cls) + + return order, cyclic_edges + + +def generate_class_instances_file(version_module, dir_path, cls_name, full_module_path, + instances_raw, deferred_classes, id_to_ref, + class_full_modules, env): + """ + Write target/openminds/{version_module}/{dir_path}/{module_name}_instances.py + containing only cls_name's own instances. + + Properties whose value references another class in deferred_classes are routed + to cross_class_patches (returned) rather than rendered here, so that this file + never needs to import a class it is cyclically coupled with. + """ + module_name = full_module_path.split(".")[-1] + referenced_classes = set() + phase1_instances = [] + phase2_patches = [] + cross_class_patches = [] + + for inst_name, inst_data in instances_raw.items(): + phase1_props = {} + for prop_name, value in inst_data.items(): + local_refs = set() + filtered = _resolve_value(value, id_to_ref, local_refs) + if _has_cyclic_ref(filtered, deferred_classes): + if local_refs - {cls_name}: + cross_class_patches.append((cls_name, inst_name, prop_name, filtered)) + else: + phase2_patches.append((cls_name, inst_name, prop_name, filtered)) + referenced_classes |= local_refs + else: + phase1_props[prop_name] = filtered + referenced_classes |= local_refs + phase1_instances.append((cls_name, inst_name, phase1_props)) + + imports = {f"from openminds.{version_module}.{full_module_path} import {cls_name}"} + if any(_instance_needs_iri(props) for _cls_name, _inst_name, props in phase1_instances): + imports.add("from openminds.base import IRI") + for ref_class in referenced_classes: + if ref_class in class_full_modules: + full_path = class_full_modules[ref_class] + imports.add(f"from openminds.{version_module}.{full_path} import {ref_class}") + + context = { + "imports": sorted(imports), + "phase1_instances": phase1_instances, + "phase2_patches": phase2_patches, + } + + output_path = os.path.join( + "target", "openminds", version_module, dir_path, f"{module_name}_instances.py" + ) + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as fp: + contents = env.get_template("pipeline/src/instances_template.py.txt").render(context) + fp.write(contents) + + return cross_class_patches + + +def generate_instance_patches_file(version_module, dir_path, cross_class_patches, class_full_modules, env): + """Write target/openminds/{version_module}/{dir_path}/_instance_patches.py.""" + imports = set() + for cls_name, _inst_name, _prop_name, value in cross_class_patches: + referenced_classes = {cls_name} + _collect_python_ref_classes(value, referenced_classes) + for ref_class in referenced_classes: + if ref_class in class_full_modules: + full_path = class_full_modules[ref_class] + imports.add(f"from openminds.{version_module}.{full_path} import {ref_class}") + + context = { + "imports": sorted(imports), + "patches": cross_class_patches, + } + + output_path = os.path.join("target", "openminds", version_module, dir_path, "_instance_patches.py") + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as fp: + contents = env.get_template("pipeline/src/instance_patches_template.py.txt").render(context) + fp.write(contents) + + +def generate_directory_instance_files(version_module, dir_path, dir_class_data, + all_instances_for_version, id_to_ref, + class_full_modules, env): + """ + Write one {module_name}_instances.py per class with instances in this directory, + plus a shared _instance_patches.py for any cross-class cyclic references. + + Returns True if a _instance_patches.py file was written (so the caller can wire + up its import trigger in __init__.py), False otherwise. + """ + classes = {item[0] for item in dir_class_data} + + # Build a dependency_graph restricted to intra-directory edges (for cycle detection) + dir_instances_by_type = {} + for cls_name, _full_mod, _raw in dir_class_data: + for type_iri, type_insts in all_instances_for_version.items(): + if type_iri.split("/")[-1] == cls_name: + dir_instances_by_type[type_iri] = type_insts + break + + full_dependency_graph = build_instance_ref_graph(dir_instances_by_type, id_to_ref) + local_dependency_graph = { + class_name: {dependency for dependency in dependencies if dependency in classes} + for class_name, dependencies in full_dependency_graph.items() + if class_name in classes + } + + _order, cyclic_edges = _sort_classes_by_creation_order(local_dependency_graph, classes) + + all_cross_class_patches = [] + for cls_name, full_module_path, instances_raw in dir_class_data: + # Defer cross-class cyclic refs AND same-class refs (forward-reference risk) + deferred_classes = { + referenced_class for (referencing_class, referenced_class) in cyclic_edges + if referencing_class == cls_name + } | {cls_name} + + cross_class_patches = generate_class_instances_file( + version_module, dir_path, cls_name, full_module_path, instances_raw, + deferred_classes, id_to_ref, class_full_modules, env + ) + all_cross_class_patches.extend(cross_class_patches) + + if all_cross_class_patches: + generate_instance_patches_file(version_module, dir_path, all_cross_class_patches, class_full_modules, env) + return True + return False + def clone_sources(branch="main"): if os.path.exists("_sources"):