Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,8 @@ The SDK automatically handles all dependency packaging for Data Cloud deployment
├── payload
│ ├── config.json
│ ├── entrypoint.py
├── files
│ ├── data.csv
├── files
├── data.csv
```

## py-files directory
Expand All @@ -137,18 +137,18 @@ Your Python dependencies can be packaged as .py files, .zip archives (containing
├── payload
│ ├── config.json
│ ├── entrypoint.py
├── py-files
│ ├── moduleA
│ │ ├── __init__.py
│ │ ├── moduleA.py
├── py-files
├── moduleA
│ │ ├── __init__.py
│ │ ├── moduleA.py
```

## API

Your entry point script will define logic using the `Client` object which wraps data access layers.

You should only need the following methods:
* `find_file_path(file_name)` - Returns a file path
* `find_file_path(file_name)` – Resolve a bundled file (placed under `payload/files/`) to a `pathlib.Path` that exists. Works the same locally and inside Data Cloud — see [Bundled file resolution](#bundled-file-resolution) below for the full lookup order. Raises `FileNotFoundError` if the file isn't found.
* `read_dlo(name)` – Read from a Data Lake Object by name
* `read_dmo(name)` – Read from a Data Model Object by name
* `write_to_dlo(name, spark_dataframe, write_mode)` – Write to a Data Lake Object by name with a Spark dataframe
Expand All @@ -169,6 +169,24 @@ client.write_to_dlo('output_DLO')
> [!WARNING]
> Currently, a script can either read from DMOs and write to DMOs, or read from DLOs and write to DLOs; it cannot mix the two (for example, read from a DLO and write to a DMO).

### Bundled file resolution

Place bundled files (CSVs, prompt files, etc.) under `payload/files/`. The same `client.find_file_path("data.csv")` call resolves consistently across all three runtimes:

- `datacustomcode run` (local) → `<cwd>/payload/files/data.csv`
- Data Cloud script package → `$LIBRARY_PATH/files/data.csv`
- Data Cloud function package → `$LIBRARY_PATH/files/data.csv`

Resolution order (first existing path wins):

1. `$LIBRARY_PATH/files/<file_name>`, then `$LIBRARY_PATH/<file_name>` — when `LIBRARY_PATH` is set. Data Cloud sets this for you to the package root.
2. `payload/files/<file_name>` relative to the current working directory.
3. `<config_dir>/files/<file_name>` where `<config_dir>` is the directory of the nearest `config.json` discoverable by walking down from cwd.

If none of these exist, `find_file_path` raises `FileNotFoundError` with the list of paths it tried.

`$LIBRARY_PATH` is set automatically to the root of the package at runtime inside Data Cloud.


## CLI

Expand Down
32 changes: 31 additions & 1 deletion src/datacustomcode/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,38 @@ def write_to_dmo(
return self._writer.write_to_dmo(name, dataframe, write_mode, **kwargs) # type: ignore[no-any-return]

def find_file_path(self, file_name: str) -> Path:
    """Resolve a bundled file shipped in the package to a path that exists.

    This is a thin wrapper that delegates to the configured file access
    layer. With the default layer, the resolution order is (first existing
    path wins):

    1. ``$LIBRARY_PATH/<file_folder>/<file_name>`` then
       ``$LIBRARY_PATH/<file_name>`` — when the ``LIBRARY_PATH`` environment
       variable is set. The Data Cloud runtime sets this to the directory
       containing the extracted package.
    2. ``<code_package>/<file_folder>/<file_name>`` relative to the current
       working directory — the default ``payload/files/<file_name>`` layout
       used by ``datacustomcode run`` from a project root.
    3. ``<config_dir>/<file_folder>/<file_name>`` where ``<config_dir>`` is
       the directory containing the nearest ``config.json`` discoverable
       by walking the cwd subtree.

    ``LIBRARY_PATH`` must point to the directory that *contains*
    ``files/`` — i.e., the package root, the same directory that holds
    ``config.json`` and ``entrypoint.py``. See
    ``docs/byoc_runtime_contract.md`` for the full runtime contract.

    Args:
        file_name: A file under the package's ``files/`` folder. Relative
            subpaths (e.g., ``"file/data2.csv"``) are supported.

    Returns:
        A ``pathlib.Path`` that exists. The path is returned exactly as the
        matching candidate was built, so a cwd-relative candidate stays
        relative.

    Raises:
        ValueError: If ``file_name`` is empty (raised by the default file
            layer).
        FileNotFoundError: If the file does not exist at any of the
            resolution-order locations. The message lists every candidate
            path that was tried.
    """
    return self._file.find_file_path(file_name)  # type: ignore[no-any-return]

def _validate_data_layer_history_does_not_contain(
Expand Down
57 changes: 27 additions & 30 deletions src/datacustomcode/file/path/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import os
from pathlib import Path
from typing import Optional
from typing import Iterator, Optional

from datacustomcode.file.base import BaseDataAccessLayer

Expand Down Expand Up @@ -66,54 +66,48 @@ def find_file_path(self, file_name: str) -> Path:
file_name: The name of the file to open

Returns:
A file path
A file path that exists

Raises:
FileNotFoundError: If the file cannot be found
"""
if not file_name:
raise ValueError("file_name cannot be empty")

file_path = self._resolve_file_path(file_name)
tried: list[Path] = []
for candidate in self._candidate_paths(file_name):
tried.append(candidate)
if candidate.exists():
return candidate

if not file_path.exists():
raise FileNotFoundError(
f"File '{file_name}' not found in any search location"
)
raise FileNotFoundError(
f"File '{file_name}' not found in any search location. "
f"Tried: {[str(p) for p in tried]}"
)

return file_path

def _candidate_paths(self, file_name: str) -> Iterator[Path]:
    """Yield candidate paths for ``file_name`` in resolution order.

    Candidates are produced lazily and are NOT checked for existence here;
    the caller takes the first one that exists. Later lookups (such as the
    ``config.json`` search) therefore only run when the earlier candidates
    were rejected by the caller.

    Args:
        file_name: The name of the file to resolve

    Yields:
        Distinct candidate paths, highest priority first. A path that
        would be produced by more than one rule is yielded only once, so a
        "tried" list built from this iterator has no repeats.
    """

    def _ordered() -> Iterator[Path]:
        # 1. $LIBRARY_PATH/<file_folder>/<file_name>, then $LIBRARY_PATH/<file_name>
        env_path = os.getenv(self.DEFAULT_ENV_VAR)
        if env_path:
            library_root = Path(env_path)
            yield library_root / self.file_folder / file_name
            yield library_root / file_name

        # 2. <code_package>/<file_folder>/<file_name> relative to cwd
        if self._code_package_exists():
            yield self._get_code_package_file_path(file_name)

        # 3. <config_dir>/<file_folder>/<file_name> via config.json discovery
        config_path = self._find_config_file()
        if config_path is not None:
            yield self._get_config_based_file_path(file_name, config_path)

    seen: set[Path] = set()
    for candidate in _ordered():
        if candidate not in seen:
            seen.add(candidate)
            yield candidate

def _resolve_file_path(self, file_name: str) -> Path:
    """Resolve the full path to a file.

    Compatibility entry point kept for callers that predate
    ``_candidate_paths``. It walks the same candidates and keeps the
    original fallback contract: when nothing exists, the bare ``file_name``
    is returned as a ``Path`` and the caller decides how to report it.

    Args:
        file_name: The name of the file to resolve

    Returns:
        The first candidate path that exists, otherwise ``Path(file_name)``
    """
    for candidate in self._candidate_paths(file_name):
        if candidate.exists():
            return candidate
    return Path(file_name)

def _code_package_exists(self) -> bool:
"""Check if the default code package directory exists.
Expand Down Expand Up @@ -146,15 +140,18 @@ def _find_config_file(self) -> Optional[Path]:
def _get_config_based_file_path(self, file_name: str, config_path: Path) -> Path:
    """Get the file path relative to the config file location.

    Anchors on the directory containing the discovered ``config.json`` so
    that a package located through config discovery resolves files relative
    to its own root, not the caller's cwd.

    Args:
        file_name: The name of the file; may include a relative subpath
        config_path: The path to the config file

    Returns:
        ``<directory of config_path>/<file_folder>/<file_name>``. The path
        is built only; its existence is not checked here.
    """
    return config_path.parent / self.file_folder / file_name

def _find_file_in_tree(self, filename: str, search_path: Path) -> Optional[Path]:
"""Find a file within a directory tree.
Expand Down
Loading
Loading