Description
The failure occurs only on the ubuntu-latest runner.
Failing parametrization: `chunksizes_in_mem = (1000, 1000)`, `subsample_size = 100`.
Cluster: `LocalCluster(45e6d9e3, 'tcp://127.0.0.1:38789', workers=1, threads=1, memory=15.61 GiB)`.
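For reference, the repr above corresponds to a single-worker, single-threaded dask `LocalCluster`. A minimal sketch of spinning up an equivalent cluster (the `memory_limit` value is an assumption taken from the repr; the actual test fixture may differ):

```python
from dask.distributed import LocalCluster

# One worker with one thread, mirroring the cluster repr above.
# memory_limit is an assumption matching the ~15.61 GiB shown in the repr.
cluster = LocalCluster(n_workers=1, threads_per_worker=1, memory_limit="15.61GiB")
```

The failing test and assertion output: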
@pytest.mark.parametrize("fn", [fn_large]) # type: ignore
@pytest.mark.parametrize("chunksizes_in_mem", [(1000, 1000), (2500, 2500)]) # type: ignore
@pytest.mark.parametrize("subsample_size", [100, 100000]) # type: ignore
def test_delayed_subsample__memusage(
self, fn: str, chunksizes_in_mem: tuple[int, int], subsample_size: int, cluster: Any
):
"""
Checks for delayed subsampling function for memory usage on big file.
(and also runs output checks as not long or too memory intensive in this case)
Variables that influence memory usage are:
- Subsample sizes,
- Chunksizes in memory.
"""
# Only check on linux
if sys.platform == "linux":
# 0/ Open dataset with chunks
ds = xr.open_dataset(fn, chunks={"x": chunksizes_in_mem[0], "y": chunksizes_in_mem[1]})
darr = ds["test"].data
# 1/ Estimation of theoretical memory usage of the subsampling script
max_op_memusage = _estimate_subsample_memusage(
darr=darr, chunksizes_in_mem=chunksizes_in_mem, subsample_size=subsample_size
)
# 2/ Run delayed subsample with dask memory usage monitoring
# Derive subsample from delayed function
# (passed to wrapper function to measure memory usage during execution)
sub, measured_op_memusage = _run_dask_measuring_memusage(
cluster, delayed_subsample, darr, subsample=subsample_size, random_state=42
)
# Check the measured memory usage is smaller than the maximum estimated one
> assert measured_op_memusage < max_op_memusage
E assert np.float64(148.85546875) < np.float64(102.48738861083984)
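The assertion fails because the measured peak memory usage (≈148.86) exceeds the estimated maximum (≈102.49), overshooting by roughly 45%. For context, a wrapper like `_run_dask_measuring_memusage` can be built on distributed's `MemorySampler`; the sketch below is an assumption about its shape, not the actual helper from the test suite:

```python
from typing import Any, Callable

from dask.distributed import Client
from distributed.diagnostics import MemorySampler


def run_measuring_memusage(
    cluster: Any, func: Callable[..., Any], *args: Any, **kwargs: Any
) -> tuple[Any, float]:
    """Run func against the cluster and return (result, peak cluster memory in MiB)."""
    ms = MemorySampler()
    with Client(cluster):  # becomes the default client used by MemorySampler
        with ms.sample("operation"):
            result = func(*args, **kwargs)
    # to_pandas() returns one column per sample label, with values in bytes
    peak_bytes = ms.to_pandas()["operation"].max()
    return result, peak_bytes / (1024**2)
```

Note that `MemorySampler` polls the scheduler periodically, so short-lived spikes between samples can be missed, and the measurement includes the workers' baseline memory; both effects make comparisons against a theoretical estimate inherently noisy, which may explain why this only trips on one runner.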