From fccfdc135a8322fd383b6ddd6a7e35f5ba9302c0 Mon Sep 17 00:00:00 2001 From: Diksha Date: Thu, 4 Jun 2026 20:18:58 +0530 Subject: [PATCH 1/8] Add entrypoint directory to sys.path to support local module imports --- src/datacustomcode/run.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/datacustomcode/run.py b/src/datacustomcode/run.py index 6322270..0d9a110 100644 --- a/src/datacustomcode/run.py +++ b/src/datacustomcode/run.py @@ -201,8 +201,20 @@ def run_function_with_test(entrypoint: str, test_file: str) -> None: def add_py_folder(entrypoint: str): + """Add py-files subfolder and entrypoint directory to sys.path. + + This ensures: + 1. py-files/ is available for additional dependencies + 2. The entrypoint directory is available for local module imports + """ default_py_folder = "py-files" # Hardcoded folder name cwd = Path.cwd().joinpath(entrypoint) - py_folder = cwd.parent.joinpath(default_py_folder) + entrypoint_dir = cwd.parent + py_folder = entrypoint_dir.joinpath(default_py_folder) + + # Add py-files folder if it exists + if py_folder.exists(): + sys.path.insert(0, str(py_folder)) - sys.path.append(str(py_folder)) + # Add entrypoint directory to allow local module imports (e.g., utility.py) + sys.path.insert(0, str(entrypoint_dir)) From c1015c06fa8ae226adf2d5a19b475f4c2de686aa Mon Sep 17 00:00:00 2001 From: Diksha Date: Thu, 4 Jun 2026 20:27:56 +0530 Subject: [PATCH 2/8] Add test for local module imports via add_py_folder --- tests/test_run.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/test_run.py b/tests/test_run.py index dcb8225..1f17a27 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -237,6 +237,48 @@ def test_run_entrypoint_with_dependencies(): sys.path.remove(module_dir) +def test_add_py_folder_enables_local_imports(): + """Test that add_py_folder adds entrypoint directory to sys.path for local imports.""" + from datacustomcode.run import add_py_folder + + # Create a temporary directory structure + temp_dir = tempfile.mkdtemp() + + try: + # Create a utility module in the temp directory + utility_path = os.path.join(temp_dir, "utility.py") + with open(utility_path, "w") as f: + f.write("TEST_VALUE = 'local_module_works'\n") + + # Create an entrypoint file + entrypoint_path = os.path.join(temp_dir, "entrypoint.py") + with open(entrypoint_path, "w") as f: + f.write("# Test entrypoint\n") + + # Save original sys.path + original_path = sys.path.copy() + + # Call add_py_folder with relative path from current directory + relative_entrypoint = os.path.relpath(entrypoint_path) + add_py_folder(relative_entrypoint) + + # verify we can now import the utility module + import utility + + assert hasattr(utility, "TEST_VALUE"), "utility module should have TEST_VALUE" + assert ( + utility.TEST_VALUE == "local_module_works" + ), f"Expected 'local_module_works', got {utility.TEST_VALUE}" + + finally: + # Cleanup + sys.path = original_path + if "utility" in sys.modules: + del sys.modules["utility"] + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + class TestDataspaceScenarios: """Test dataspace functionality in run_entrypoint.""" From 16c5ab490336027e8eb7f85fb7500f4a83298d14 Mon Sep 17 00:00:00 2001 From: Diksha Date: Thu, 4 Jun 2026 20:33:04 +0530 Subject: [PATCH 3/8] Update comments --- src/datacustomcode/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datacustomcode/run.py b/src/datacustomcode/run.py index 0d9a110..006055c 100644 --- a/src/datacustomcode/run.py +++ b/src/datacustomcode/run.py @@ -207,7 +207,7 @@ def add_py_folder(entrypoint: str): 1. py-files/ is available for additional dependencies 2. The entrypoint directory is available for local module imports """ - default_py_folder = "py-files" # Hardcoded folder name + default_py_folder = "py-files" cwd = Path.cwd().joinpath(entrypoint) entrypoint_dir = cwd.parent py_folder = entrypoint_dir.joinpath(default_py_folder) @@ -216,5 +216,5 @@ def add_py_folder(entrypoint: str): if py_folder.exists(): sys.path.insert(0, str(py_folder)) - # Add entrypoint directory to allow local module imports (e.g., utility.py) + # Add entrypoint directory to allow local module imports sys.path.insert(0, str(entrypoint_dir)) From a24a51cf3ae20de3aa735a4bd736d2e0cfa86822 Mon Sep 17 00:00:00 2001 From: Diksha Date: Thu, 4 Jun 2026 20:36:38 +0530 Subject: [PATCH 4/8] Fix lint error --- tests/test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_run.py b/tests/test_run.py index 1f17a27..1eace88 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -238,7 +238,7 @@ def test_run_entrypoint_with_dependencies(): def test_add_py_folder_enables_local_imports(): - """Test that add_py_folder adds entrypoint directory to sys.path for local imports.""" + """Test that add_py_folder adds entrypoint directory to sys.path.""" from datacustomcode.run import add_py_folder # Create a temporary directory structure From e17397ab3f05d3a2ef4edb90c181ce35810ffc0d Mon Sep 17 00:00:00 2001 From: Diksha Date: Fri, 5 Jun 2026 13:05:02 +0530 Subject: [PATCH 5/8] Update example --- .../function/chunking/payload/entrypoint.py | 82 +------------- .../function/chunking/payload/utility.py | 104 ++++++++++++++++++ 2 files changed, 107 insertions(+), 79 deletions(-) create mode 100644 src/datacustomcode/templates/function/chunking/payload/utility.py diff --git a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py index 8200e0f..81dcb32 100644 --- a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py +++ b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py @@ -1,5 +1,7 @@ import logging +from utility import extract_citations, split_text_into_chunks + from datacustomcode.function import Runtime from datacustomcode.function.feature_types.chunking import ( ChunkType, @@ -15,80 +17,6 @@ DEFAULT_MAX_CHUNK_SIZE = 50 -def split_text_into_chunks(text: str, max_size: int, overlap: int = 20): - """Split text into chunks with overlap, trying to break at natural boundaries. - - Tries to break at natural boundaries in order of preference: - 1. Paragraph boundaries (\\n\\n) - 2. Line boundaries (\\n) - 3. Sentence boundaries (. ! ?) - 4. Word boundaries (space) - 5. Hard cut if no good boundary found - - Args: - text: Text to split - max_size: Maximum characters per chunk - overlap: Number of characters to overlap between chunks - - Returns: - List of text chunks - """ - if len(text) <= max_size: - return [text] - - chunks = [] - start = 0 - - while start < len(text): - # Determine end position for this chunk - end = start + max_size - - if end >= len(text): - # Last chunk - chunks.append(text[start:]) - break - - # Try to find a good breaking point (in order of preference) - chunk_text = text[start:end] - break_point = None - - # Try to break at paragraph boundary (\n\n) - last_paragraph = chunk_text.rfind("\n\n") - if last_paragraph > max_size * 0.5: # Only if it's past halfway - break_point = start + last_paragraph + 2 # +2 to skip the \n\n - - # Try to break at line boundary (\n) - if break_point is None: - last_newline = chunk_text.rfind("\n") - if last_newline > max_size * 0.5: - break_point = start + last_newline + 1 - - # Try to break at sentence boundary (. ! ?) - if break_point is None: - for punct in [". ", "! ", "? "]: - last_sentence = chunk_text.rfind(punct) - if last_sentence > max_size * 0.5: - break_point = start + last_sentence + len(punct) - break - - # Try to break at word boundary (space) - if break_point is None: - last_space = chunk_text.rfind(" ") - if last_space > max_size * 0.5: - break_point = start + last_space + 1 - - # If no good breaking point, just hard cut - if break_point is None: - break_point = end - - chunks.append(text[start:break_point].strip()) - - # Move start position with overlap - start = max(break_point - overlap, start + 1) - - return chunks - - def function( request: SearchIndexChunkingV1Request, runtime: Runtime ) -> SearchIndexChunkingV1Response: @@ -121,11 +49,7 @@ def function( # Create chunk outputs for chunk_text in text_chunks: - # Create citations from source_dmo_fields if available - citations = {} - if metadata and metadata.source_dmo_fields: - for key, value in metadata.source_dmo_fields.items(): - citations[key] = str(value) + citations = extract_citations(metadata) chunk_output = SearchIndexChunkingV1Output( chunk_type=ChunkType.TEXT, diff --git a/src/datacustomcode/templates/function/chunking/payload/utility.py b/src/datacustomcode/templates/function/chunking/payload/utility.py new file mode 100644 index 0000000..06c3dfd --- /dev/null +++ b/src/datacustomcode/templates/function/chunking/payload/utility.py @@ -0,0 +1,104 @@ +"""Utility functions for text chunking operations.""" + +import logging +from typing import ( + Dict, + List, + Optional, +) + +from datacustomcode.function.feature_types.chunking import SearchIndexChunkingV1Metadata + +logger = logging.getLogger(__name__) + + +def split_text_into_chunks(text: str, max_size: int, overlap: int = 20) -> List[str]: + """Split text into chunks with overlap, trying to break at natural boundaries. + + Tries to break at natural boundaries in order of preference: + 1. Paragraph boundaries (\\n\\n) + 2. Line boundaries (\\n) + 3. Sentence boundaries (. ! ?) + 4. Word boundaries (space) + 5. Hard cut if no good boundary found + + Args: + text: Text to split + max_size: Maximum characters per chunk + overlap: Number of characters to overlap between chunks + + Returns: + List of text chunks + """ + if len(text) <= max_size: + return [text] + + chunks = [] + start = 0 + + while start < len(text): + # Determine end position for this chunk + end = start + max_size + + if end >= len(text): + # Last chunk + chunks.append(text[start:]) + break + + # Try to find a good breaking point (in order of preference) + chunk_text = text[start:end] + break_point = None + + # Try to break at paragraph boundary (\n\n) + last_paragraph = chunk_text.rfind("\n\n") + if last_paragraph > max_size * 0.5: # Only if it's past halfway + break_point = start + last_paragraph + 2 # +2 to skip the \n\n + + # Try to break at line boundary (\n) + if break_point is None: + last_newline = chunk_text.rfind("\n") + if last_newline > max_size * 0.5: + break_point = start + last_newline + 1 + + # Try to break at sentence boundary (. ! ?) + if break_point is None: + for punct in [". ", "! ", "? "]: + last_sentence = chunk_text.rfind(punct) + if last_sentence > max_size * 0.5: + break_point = start + last_sentence + len(punct) + break + + # Try to break at word boundary (space) + if break_point is None: + last_space = chunk_text.rfind(" ") + if last_space > max_size * 0.5: + break_point = start + last_space + 1 + + # If no good breaking point, just hard cut + if break_point is None: + break_point = end + + chunks.append(text[start:break_point].strip()) + + # Move start position with overlap + start = max(break_point - overlap, start + 1) + + return chunks + + +def extract_citations( + metadata: Optional[SearchIndexChunkingV1Metadata], +) -> Dict[str, str]: + """Extract citations from document metadata. + + Args: + metadata: Document metadata containing source DMO fields + + Returns: + Dictionary of citation key-value pairs + """ + citations = {} + if metadata and metadata.source_dmo_fields: + for key, value in metadata.source_dmo_fields.items(): + citations[key] = str(value) + return citations From 2df156e7fe48e75fdaa42ccf253329debcfa8492 Mon Sep 17 00:00:00 2001 From: Diksha Date: Mon, 8 Jun 2026 15:15:16 +0530 Subject: [PATCH 6/8] exclude local modules from scan command --- src/datacustomcode/scan.py | 18 ++++++++++-- tests/test_scan.py | 58 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/src/datacustomcode/scan.py b/src/datacustomcode/scan.py index 5e50c5d..0a1c5e6 100644 --- a/src/datacustomcode/scan.py +++ b/src/datacustomcode/scan.py @@ -258,13 +258,27 @@ def visit_ImportFrom(self, node: ast.ImportFrom) -> None: def scan_file_for_imports(file_path: str) -> Set[str]: - """Scan a Python file for external package imports.""" + """Scan a Python file for external package imports. + + Excludes local modules (Python files in the same directory). + """ with open(file_path, "r") as f: code = f.read() tree = ast.parse(code) visitor = ImportVisitor() visitor.visit(tree) - return visitor.imports + + # Filter out local modules (files in the same directory) + file_dir = os.path.dirname(file_path) + filtered_imports = set() + for package in visitor.imports: + # Check if this is a local module (a .py file exists in the same directory) + local_module_path = os.path.join(file_dir, f"{package}.py") + if not os.path.exists(local_module_path): + # Not a local module, keep it in the imports + filtered_imports.add(package) + + return filtered_imports def write_requirements_file(file_path: str) -> str: diff --git a/tests/test_scan.py b/tests/test_scan.py index 2acbc25..ac8d1af 100644 --- a/tests/test_scan.py +++ b/tests/test_scan.py @@ -927,3 +927,61 @@ def test_excluded_packages(self): assert "pyspark" not in imports finally: os.unlink(temp_path) + + def test_local_module_exclusion(self): + """Test that local modules (files in the same directory) are excluded.""" + # Create a temporary directory with multiple Python files + temp_dir = tempfile.mkdtemp() + + try: + # Create a local module file + utility_path = os.path.join(temp_dir, "utility.py") + with open(utility_path, "w") as f: + f.write(textwrap.dedent( + """ + def helper_function(): + return "helper" + """ + )) + + # Create another local module + helpers_path = os.path.join(temp_dir, "helpers.py") + with open(helpers_path, "w") as f: + f.write(textwrap.dedent( + """ + def another_helper(): + return "another" + """ + )) + + # Create the main script that imports both local modules and external packages + main_content = textwrap.dedent( + """ + from utility import helper_function + from helpers import another_helper + import pandas as pd + import numpy as np + """ + ) + main_path = os.path.join(temp_dir, "main.py") + with open(main_path, "w") as f: + f.write(main_content) + + # Scan for imports + imports = scan_file_for_imports(main_path) + + # External packages should be included + assert "pandas" in imports + assert "numpy" in imports + + # Local modules should be excluded + assert "utility" not in imports + assert "helpers" not in imports + + finally: + # Clean up + for file in ["utility.py", "helpers.py", "main.py"]: + file_path = os.path.join(temp_dir, file) + if os.path.exists(file_path): + os.unlink(file_path) + os.rmdir(temp_dir) From 01d9083b50864e26b2ec7cfb111330de2d71e02c Mon Sep 17 00:00:00 2001 From: Diksha Date: Mon, 8 Jun 2026 15:18:57 +0530 Subject: [PATCH 7/8] Update code comments --- src/datacustomcode/scan.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/datacustomcode/scan.py b/src/datacustomcode/scan.py index 0a1c5e6..afcddea 100644 --- a/src/datacustomcode/scan.py +++ b/src/datacustomcode/scan.py @@ -258,24 +258,20 @@ def visit_ImportFrom(self, node: ast.ImportFrom) -> None: def scan_file_for_imports(file_path: str) -> Set[str]: - """Scan a Python file for external package imports. - - Excludes local modules (Python files in the same directory). - """ + """Scan a Python file for external package imports.""" with open(file_path, "r") as f: code = f.read() tree = ast.parse(code) visitor = ImportVisitor() visitor.visit(tree) - # Filter out local modules (files in the same directory) + # Filter out local modules file_dir = os.path.dirname(file_path) filtered_imports = set() for package in visitor.imports: - # Check if this is a local module (a .py file exists in the same directory) + # Check if a .py file exists in the same directory local_module_path = os.path.join(file_dir, f"{package}.py") if not os.path.exists(local_module_path): - # Not a local module, keep it in the imports filtered_imports.add(package) return filtered_imports From 0b35f49328fc69af09a746258f18554e81b3b77c Mon Sep 17 00:00:00 2001 From: Diksha Date: Mon, 8 Jun 2026 15:29:36 +0530 Subject: [PATCH 8/8] Fix lint error --- tests/test_scan.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_scan.py b/tests/test_scan.py index ac8d1af..57ff409 100644 --- a/tests/test_scan.py +++ b/tests/test_scan.py @@ -937,24 +937,28 @@ def test_local_module_exclusion(self): # Create a local module file utility_path = os.path.join(temp_dir, "utility.py") with open(utility_path, "w") as f: - f.write(textwrap.dedent( - """ + f.write( + textwrap.dedent( + """ def helper_function(): return "helper" """ - )) + ) + ) # Create another local module helpers_path = os.path.join(temp_dir, "helpers.py") with open(helpers_path, "w") as f: - f.write(textwrap.dedent( - """ + f.write( + textwrap.dedent( + """ def another_helper(): return "another" """ - )) + ) + ) - # Create the main script that imports both local modules and external packages + # script imports both local modules and external packages main_content = textwrap.dedent( """ from utility import helper_function