# Source code for ImageLayer.HDF5

from Datasource import Datasource
import numpy as np
import h5py
import json
import os

class HDF5(Datasource):
    """ Loads images from hdf5 files

    Attributes
    ------------
    _read: list
        All load functions for :data:`_meta_files`
    _meta_files: list
        All extensions of files pointing to h5, in the \
same order as the readers in :data:`_read`
    """

    # All readers for _meta_files
    _read = [json.load]
    #: All extensions of files pointing to h5
    _meta_files = ['.json']

    @staticmethod
    def dtype(n):
        """ Get the dtype of ``n`` with signed integers made unsigned

        Arguments
        -----------
        n: object
            Anything with a ``dtype`` attribute, such as \
a numpy array or an h5py dataset

        Returns
        --------
        numpy.dtype
            The unsigned dtype of the same width if ``n.dtype`` \
is int8, int16, int32, or int64. Any other dtype is returned unchanged.
        """
        signed = ('int64', 'int32', 'int16', 'int8')
        unsigned = ('uint64', 'uint32', 'uint16', 'uint8')
        only_pos = dict(
            (np.dtype(s), np.dtype(u)) for s, u in zip(signed, unsigned)
        )
        d = n.dtype
        return only_pos.get(d, d)

    @staticmethod
    def load_tile(t_query):
        """load a single tile (image)

        Gets the list of h5 files from the \
:data:`TileQuery.RUNTIME` ``IMAGE.SOURCE.HDF5`` value.

        Gets the position of the image within the whole \
volume from :meth:`TileQuery.all_scales`, \
:meth:`TileQuery.source_tile_bounds`, and \
:meth:`TileQuery.blocksize`.

        Arguments
        -----------
        t_query: :class:`TileQuery`
            With file path and image position

        Returns
        -----------
        np.ndarray
            An array with the shape of \
:meth:`TileQuery.blocksize` and the dtype named by \
``OUTPUT.INFO.TYPE``. Each needed h5 file is read with \
a stride of :meth:`TileQuery.all_scales` and copied in; \
any part of the array not covered by a file stays zero.
        """
        # call superclass
        Datasource.load_tile(t_query)
        # Load data for all the h5 files
        h5_files = t_query.RUNTIME.IMAGE.SOURCE.HDF5.VALUE
        # Pair the index of each file with its starting z offset
        # (the offset is the last item of every file entry)
        z_stops = [(i, h5_file[-1]) for i, h5_file in enumerate(h5_files)]
        z_starts = z_stops[::-1]

        # Find the region to crop
        sk, sj, si = t_query.all_scales
        [z0, y0, x0], [z1, y1, x1] = t_query.source_tile_bounds
        # Plain ints keep the offset arithmetic below signed
        z0, z1 = int(z0), int(z1)
        # Get the scaled blocksize for the output array
        zb, yb, xb = t_query.blocksize

        # Last file starting at or before z0 (or the first file)
        start_z = next((i for i, z in z_starts if z <= z0), 0)
        # First file starting at or after z1 (or one past the last file)
        stop_z = next((i for i, z in z_stops if z >= z1), len(z_stops))
        needed_files = [h5_files[zi] for zi in range(start_z, stop_z)]

        ####
        # Load from all needed files
        ####
        dtype = getattr(np, t_query.OUTPUT.INFO.TYPE.VALUE)
        # Make the full volume for all needed file volumes
        full_vol = np.zeros([zb, yb, xb], dtype=dtype)

        # Loop through all needed h5 files
        for h5_file in needed_files:
            # Offset for this file
            z_offset = int(h5_file[-1])
            # First slice to read, relative to this file
            iz0 = max(z0 - z_offset, 0)
            # First slice to write, relative to the requested z0
            # and scaled by the z-scale
            oz0 = max(z_offset - z0, 0) // sk

            # Load the image region from the h5 file (read-only)
            with h5py.File(h5_file[0], 'r') as fd:
                # read from one file
                vol = fd[h5_file[1]]
                # Last slice to read, relative to this file
                iz1 = min(z1 - z_offset, vol.shape[0])
                # Get the volume from one file
                file_vol = vol[iz0:iz1:sk, y0:y1:sj, x0:x1:si]
                # Use the shape actually read, so a depth that is
                # not a multiple of the z-scale still fits
                zf, yf, xf = file_vol.shape
                # Add the volume to the full volume
                full_vol[oz0:oz0 + zf, :yf, :xf] = file_vol

        # Combined from all files
        return full_vol

    @staticmethod
    def load_file(h5_file):
        """ Load the needed volume from a single h5 File

        This is a placeholder with no implementation: \
it does nothing and returns ``None``.

        Arguments
        -----------
        h5_file: object
            Unused
        """

    @staticmethod
    def preload_source(t_query):
        """load info from example tile (image)

        Calls :meth:`valid_path` to get the filename, inner \
dataset path, and z offset of every h5 file in the volume.

        Then reads the h5 file with the highest z offset to \
get three needed values.

        Arguments
        -----------
        t_query: :class:`TileQuery`
            Only the file path is needed

        Returns
        --------
        dict
            Will be empty if :meth:`valid_path` finds \
this filename to not give a valid h5 volume. Otherwise, \
the result of :meth:`Datasource.preload_source` updated with \
the result of :meth:`valid_path` and these values:

            * :class:`RUNTIME` ``.IMAGE.BLOCK.NAME``
                (numpy.ndarray) -- 1x3 for any given tile shape
            * :class:`OUTPUT` ``.INFO.TYPE.NAME``
                (str) -- numpy dtype of any given tile
            * :class:`OUTPUT` ``.INFO.SIZE.NAME``
                (numpy.ndarray) -- 3x1 for full volume shape
        """
        # Keyword names
        output = t_query.OUTPUT.INFO
        runtime = t_query.RUNTIME.IMAGE
        k_h5 = runtime.SOURCE.HDF5.NAME
        # Get the max block size in bytes for a single tile
        # (a block is up to 64 slices deep)
        max_bytes = t_query.RUNTIME.CACHE.MAX_BLOCK.VALUE
        # At least one byte, so the division below is defined
        max_bytes = max(int(max_bytes / 64), 1)

        # Check if path is valid
        keywords = HDF5.valid_path(t_query)
        if not keywords:
            return {}

        # Validate highest in z file name and dataset
        filename = keywords[k_h5][-1][0]
        dataset = keywords[k_h5][-1][1]
        offset = keywords[k_h5][-1][2]
        # Load properties from H5 dataset
        with h5py.File(filename, 'r') as fd:
            # Get the volume
            vol = fd[dataset]
            # Get a shape for all the files
            shape = np.uint32(vol.shape)
            shape[0] += offset
            ####
            # Get a blockshape as a flat section
            ####
            # Get the bytes for a full slice as a python int,
            # so the product cannot overflow a fixed-width integer
            voxel_bytes = int(vol.dtype.itemsize)
            slice_voxels = int(np.prod(shape[1:], dtype=np.uint64))
            slice_bytes = voxel_bytes * slice_voxels
            # Get the nearest tile size under cache limit
            # (true division, so the ratio is rounded up and not down)
            square_overage = np.ceil(float(slice_bytes) / max_bytes)
            # Never scale by less than one
            side_scalar = np.maximum(np.ceil(np.sqrt(square_overage)), 1)
            # Set the actual blocksize to be under the cache limit
            plane_shape = np.ceil(shape[1:] / side_scalar)
            max_block = np.r_[[64], plane_shape]
            ####
            # Get max blocksizes for different resolutions
            ####
            lo_res = 1
            # Get all block sizes by halving the max block size
            all_blocks = [shape / float(2 ** res) for res in range(lo_res)]
            block_array = np.clip(np.ceil(all_blocks), 1, max_block)
            # return named keywords
            keywords.update({
                runtime.BLOCK.NAME: np.uint32(block_array),
                output.SIZE.NAME: np.uint32(shape),
                output.TYPE.NAME: str(HDF5.dtype(vol)),
            })
        # Combine results with parent method
        common = Datasource.preload_source(t_query)
        return dict(common, **keywords)

    @staticmethod
    def valid_path(t_query):
        """ Check if filename can access h5 data

        The filename can be a path to a json file \
that lists h5 files and dataset paths, or \
the filename can be a direct path to an h5 \
file. Every listed h5 file is opened read-only. \
If the listed dataset path is not in the file, \
the first key in the file is used instead.

        Arguments
        -----------
        t_query: :class:`TileQuery`
            Only the file path is needed

        Returns
        --------
        dict
            Empty if any file cannot be opened or has no keys. \
Otherwise, one item:

            * :class:`RUNTIME` ``.IMAGE.SOURCE.HDF5.NAME``
                (list) -- One ``[filename, dataset, z offset]`` \
list per h5 file, sorted by ascending z offset
        """
        # Dereference path to hdf5 data
        k_h5 = t_query.RUNTIME.IMAGE.SOURCE.HDF5.NAME
        # Make a real list, as it is looped over and then sorted
        h5_list = list(HDF5.load_info(t_query))
        # load all the files
        for h5_file in h5_list:
            try:
                # Try to load one file
                with h5py.File(h5_file[0], 'r') as fd:
                    inner = h5_file[1]
                    if not inner or inner not in fd:
                        # Fall back to the first key in the file
                        h5_file[1] = list(fd.keys())[0]
            except Exception:
                # Any unreadable file makes the whole path invalid
                return {}

        # return files sorted by ascending z start
        return {
            k_h5: sorted(h5_list, key=lambda h_file: h_file[-1])
        }

    @staticmethod
    def get_details(h5_info, file_dict):
        """ Get all needed h5 file info from a pointer file

        Arguments
        ----------
        h5_info: object
            Gives the key names as ``OUTER.NAME``, \
``INNER.NAME``, and ``OFF.NAME``
        file_dict: dict
            Contains keys for INNER, OUTER, and OFF values

        Returns
        --------
        list
            The OUTER, INNER, and OFF values in a flat list. \
A missing OUTER or INNER value is ``None`` and \
a missing OFF value is 0.
        """

        # Get values for actual hdf5 file
        outer_path = file_dict.get(h5_info.OUTER.NAME)
        inner_path = file_dict.get(h5_info.INNER.NAME)
        z_offset = file_dict.get(h5_info.OFF.NAME, 0)

        return [outer_path, inner_path, z_offset]

    @staticmethod
    def load_info(t_query):
        """ Gets the h5 volume filename and datapath

        If the t_query path has an extension in \
the :data:`_meta_files`, then the file is read with \
the matching reader in :data:`_read`. The file can \
hold one dictionary or a list of dictionaries, each \
passed to :meth:`get_details`.

        If the extension is not in :data:`_meta_files`, or \
the file cannot be opened or parsed, then the \
original t_query path is returned along with the \
default dataset given by \
``RUNTIME.IMAGE.SOURCE.HDF5.INNER.VALUE`` and a z offset of 0.

        Arguments
        -----------
        t_query: :class:`TileQuery`
            Only the file path is needed

        Returns
        --------
        list
            One list per h5 file, each with:

            * The direct filename to an hdf5 file
            * The dataset in the file with image data
            * The z offset of the file in the volume
        """
        # Load information about full hdf5
        h5_info = t_query.RUNTIME.IMAGE.SOURCE.HDF5
        filename = t_query.OUTPUT.INFO.PATH.VALUE
        dataset = h5_info.INNER.VALUE

        # Without a readable meta file, treat the path as a single h5 file
        ending = os.path.splitext(filename)[1]
        if ending not in HDF5._meta_files:
            return [[filename, dataset, 0]]

        # Get function to read the metainfo file
        order = HDF5._meta_files.index(ending)
        try:
            with open(filename) as infile:
                # Read the metainfo file
                info = HDF5._read[order](infile)
        except (IOError, ValueError):
            # Missing, unreadable, or malformed metainfo file
            return [[filename, dataset, 0]]

        # Handle references to multiple h5 files
        if isinstance(info, list):
            return [HDF5.get_details(h5_info, item) for item in info]
        # Get the inner dataset and the new path
        return [HDF5.get_details(h5_info, info)]