Start of executable docs #777
base: main
@@ -76,3 +76,5 @@ ENV/

# MkDocs documentation
site*/

icechunk-local
@@ -5,23 +5,16 @@ However, because of how Icechunk works, it's not possible to use the existing [`

Instead, Icechunk provides its own specialized functions to make distributed writes with Dask and Xarray.
This page explains how to use these specialized functions.

!!! note

    Using Xarray, Dask, and Icechunk requires `icechunk>=0.1.0a5`, `dask>=2024.11.0`, and `xarray>=2024.11.0`.

Start with an icechunk store and dask arrays.

Review comment (on deleted lines 13-20, the old `distributed.Client` setup): notable change here is that the first example doesn't use a client, because it was not working without one.

```python exec="on" session="dask" source="material-block"
import icechunk
import tempfile

# initialize the icechunk store
storage = icechunk.local_filesystem_storage(tempfile.TemporaryDirectory().name)
icechunk_repo = icechunk.Repository.create(storage)
icechunk_session = icechunk_repo.writable_session("main")
```
@@ -33,15 +26,15 @@ The API follows that of [`dask.array.store`](https://docs.dask.org/en/stable/gen
support for the `compute` kwarg.

First create a dask array to write:

```python exec="on" session="dask" source="material-block"
import dask.array as da
shape = (100, 100)
dask_chunks = (20, 20)
dask_array = da.random.random(shape, chunks=dask_chunks)
```

Now create the Zarr array you will write to.

```python exec="on" session="dask" source="material-block"
import zarr

zarr_chunks = (10, 10)
@@ -60,33 +53,48 @@ write task is independent, and will not conflict. It is your responsibility to e
conflicts are avoided.

Now write

```python exec="on" session="dask" source="material-block" result="code"
import icechunk.dask

icechunk.dask.store_dask(
    icechunk_session,
    sources=[dask_array],
    targets=[zarray]
)
```

Finally commit your changes!

```python exec="on" session="dask" source="material-block"
print(icechunk_session.commit("wrote a dask array!"))
```

## Distributed

In distributed contexts where the Session and Zarr Array objects are sent across the network,
you must opt in to pickling of a writable store. This is the case when you have initialized a Dask
cluster, for example with a `distributed.Client`.
[`icechunk.dask.store_dask`](./reference.md#icechunk.dask.store_dask) takes care of the hard part of
merging Sessions, but you must opt in to pickling before creating the target Zarr array objects.

Here is an example:

```python exec="on" session="dask" source="material-block" result="code"
from distributed import Client
client = Client()

import icechunk.dask

# start a new session. The old session is read-only after committing.
icechunk_session = icechunk_repo.writable_session("main")
zarr_chunks = (10, 10)
with icechunk_session.allow_pickling():
    group = zarr.group(
        store=icechunk_session.store,
        overwrite=True
    )

    zarray = group.create_array(
        "array",
@@ -95,8 +103,13 @@ with icechunk_session.allow_pickling():
        dtype="f8",
        fill_value=float("nan"),
    )

    icechunk.dask.store_dask(
        icechunk_session,
        sources=[dask_array],
        targets=[zarray]
    )
print(icechunk_session.commit("wrote a dask array!"))
```

## Icechunk + Dask + Xarray
@@ -113,20 +126,31 @@ Notably the ``compute`` kwarg is not supported.

Now roundtrip an xarray dataset

```python exec="on" session="dask" source="material-block" result="code"
import icechunk.xarray
import xarray as xr

icechunk_session = icechunk_repo.writable_session("main")
dataset = xr.tutorial.open_dataset(
    "rasm", chunks={"time": 1}
).isel(time=slice(24))

# `to_icechunk` takes care of "allow_pickling" for you
icechunk.xarray.to_icechunk(dataset, icechunk_session, mode="w")

with icechunk_session.allow_pickling():
    roundtripped = xr.open_zarr(icechunk_session.store, consolidated=False)
    print(dataset.identical(roundtripped))
```

Review comment (on the `to_icechunk` call): had to add the mode to avoid an error.

Finally commit your changes!

```python exec="on" session="dask" source="material-block" result="code"
print(icechunk_session.commit("wrote an Xarray dataset!"))
```

```python exec="on" session="dask"
# handy when running mkdocs serve locally
client.shutdown()
```
@@ -16,38 +16,38 @@ including those executed remotely in a multi-processing or any other remote exec
Here is how you can execute such writes with Icechunk, illustrated with a `ThreadPoolExecutor`.
First read some example data, and create an Icechunk Repository.

```python exec="on" session="parallel" source="material-block"
import xarray as xr
import tempfile
from icechunk import Repository, local_filesystem_storage

ds = xr.tutorial.open_dataset("rasm").isel(time=slice(24))
repo = Repository.create(local_filesystem_storage(tempfile.TemporaryDirectory().name))
session = repo.writable_session("main")
```
We will orchestrate so that each task writes one timestep.
This is an arbitrary choice but determines what we set for the Zarr chunk size.

```python exec="on" session="parallel" source="material-block" result="code"
# one timestep per chunk along "time", whole-dimension chunks otherwise
chunks = tuple(1 if dim == "time" else ds.sizes[dim] for dim in ds.Tair.dims)
```
Initialize the dataset using [`Dataset.to_zarr`](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_zarr.html)
and `compute=False`; this will NOT write any chunked array data, but will write all array metadata, and any
in-memory arrays (only `time` in this case).

```python exec="on" session="parallel" source="material-block"
ds.to_zarr(session.store, compute=False, encoding={"Tair": {"chunks": chunks}}, mode="w")
# this commit is optional, but may be useful in your workflow
print(session.commit("initialize store"))
```
## Multi-threading

First define a function that constitutes one "write task".

```python exec="on" session="parallel" source="material-block"
from icechunk import Session

def write_timestamp(*, itime: int, session: Session) -> None:
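    # The diff cuts the function body off here. A hedged sketch of the
    # rest, inferred from the surrounding example, not this PR's verbatim code:
    # pass a list to isel so the "time" dimension is preserved, then write
    # that single timestep into the region initialized above.
    tslice = ds.isel(time=[itime])
    tslice.to_zarr(session.store, region="auto", consolidated=False)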
@@ -59,22 +59,22 @@ def write_timestamp(*, itime: int, session: Session) -> None:

Now execute the writes.

Review comment: this example runs fine for me locally, but ends up not writing anything to the store when running on readthedocs.

<!-- ```python exec="on" session="parallel" source="material-block" result="code" -->
```python
from concurrent.futures import ThreadPoolExecutor, wait
from icechunk.distributed import merge_sessions

session = repo.writable_session("main")
with ThreadPoolExecutor() as executor:
    # submit the writes
    futures = [executor.submit(write_timestamp, itime=i, session=session) for i in range(ds.sizes["time"])]
    wait(futures)

print(session.commit("finished writes"))
```

Verify that the writes worked as expected:

```python exec="on" session="parallel" source="material-block" result="code"
ondisk = xr.open_zarr(repo.readonly_session("main").store, consolidated=False)
xr.testing.assert_identical(ds, ondisk)
```
@@ -134,7 +134,7 @@ with ProcessPoolExecutor() as executor:

# manually merge the remote sessions into the local session
session = merge_sessions(session, *sessions)
print(session.commit("finished writes"))
```
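The hunk above shows only the tail of the multi-processing example. A hedged sketch of the full pattern it implies, assuming a variant of `write_timestamp` that returns its `Session` so that per-process changes can be merged (an inference from context, not this PR's verbatim code):

```python
from concurrent.futures import ProcessPoolExecutor
from icechunk.distributed import merge_sessions

session = repo.writable_session("main")
with ProcessPoolExecutor() as executor:
    # opt in to pickling so the writable session can cross process boundaries
    with session.allow_pickling():
        futures = [
            executor.submit(write_timestamp, itime=i, session=session)
            for i in range(ds.sizes["time"])
        ]
        # each worker returns its own Session holding that worker's changes
        sessions = [f.result() for f in futures]

# manually merge the remote sessions into the local session
session = merge_sessions(session, *sessions)
print(session.commit("finished writes"))
```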
Verify that the writes worked as expected:
Review comment: we can directly install from GitHub with pip too if we know the commit ID (example below). This will require maturin in the env, which we seem to have.
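The comment's example command did not survive the page scrape; a hedged reconstruction of what such an install might look like, where the repository URL, the `#subdirectory` option, and the `<commit-sha>` placeholder are assumptions rather than text from the comment:

```bash
# hypothetical: build and install icechunk-python from a specific commit
# (building from source requires maturin in the environment)
pip install "git+https://github.com/earth-mover/icechunk.git@<commit-sha>#subdirectory=icechunk-python"
```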