# Source code for metacsv.io.converters

'''
Utilities for converting between metacsv-compatible data formats
'''

from __future__ import absolute_import, division, print_function, \
    with_statement, unicode_literals

import pandas as pd
import numpy as np
import xarray as xr
from collections import OrderedDict

from metacsv.io.to_xarray import (
    metacsv_series_to_dataarray,
    metacsv_series_to_dataset,
    metacsv_dataframe_to_dataset,
    metacsv_dataframe_to_dataarray)

from metacsv.io.to_csv import (
    metacsv_to_csv,
    metacsv_to_header,
    _header_to_file_object)

from metacsv.io.parsers import read_csv
from metacsv.core.containers import Series, DataFrame, Panel
from metacsv.core.internals import Coordinates, Variables, Attributes
from metacsv._compat import string_types, stream_types, BytesIO, StringIO


def _coerce_to_metacsv(container, *args, **kwargs):
    '''
    Return ``container`` as a metacsv Series, DataFrame, or Panel

    Parameters
    ----------
    container : object
        A metacsv Series, DataFrame, or Panel (returned unchanged, as the
        same object), a string or stream (read with ``read_csv``), or a
        pandas Series, DataFrame, or Panel (wrapped in the matching metacsv
        container).

    *args :
        Positional arguments forwarded to ``read_csv``. Only used when
        ``container`` is a string or stream.

    **kwargs :
        Keyword arguments forwarded to ``read_csv``. Only used when
        ``container`` is a string or stream.

    Returns
    -------
    container : Series, DataFrame, or Panel
        A metacsv container

    Raises
    ------
    NotImplementedError
        If ``container`` is an xarray DataArray or Dataset
    TypeError
        If ``container`` is of any other unsupported type
    '''

    # Already a metacsv container: hand the very same object back.
    if isinstance(container, (Series, DataFrame, Panel)):
        return container

    if isinstance(container, (string_types, stream_types)):
        return read_csv(container, *args, **kwargs)

    if isinstance(container, pd.Series):
        return Series(container)

    if isinstance(container, pd.DataFrame):
        return DataFrame(container)

    # ``pandas.Panel`` does not exist in every pandas release. Look it up
    # defensively so that the informative errors below are still reached
    # instead of an AttributeError raised by the attribute access itself.
    pandas_panel = getattr(pd, 'Panel', None)
    if pandas_panel is not None and isinstance(container, pandas_panel):
        return Panel(container)

    if isinstance(container, (xr.DataArray, xr.Dataset)):
        raise NotImplementedError(
            'automatic coersion of xarray objects not implemented')

    raise TypeError(
        'Unknown data type. Must be a Series, DataFrame, or Panel')


def _parse_args(container, attrs, coords, variables):
    '''
    Apply caller-supplied metadata to ``container`` in place

    For each of ``attrs``, ``coords``, and ``variables`` that is not None:
    if the container has that attribute and it compares equal to None, the
    supplied value is assigned to it; otherwise the existing value's
    ``update`` method is called with the supplied value. When coordinates
    are assigned, ``container.add_coords()`` is called first.

    Parameters
    ----------
    container : object
        Object carrying ``attrs``, ``coords``, and ``variables`` attributes

    attrs : dict or None
        Container attributes, or None to leave them untouched

    coords : dict or None
        Container coordinates, or None to leave them untouched

    variables : dict or None
        Variable-specific attributes, or None to leave them untouched
    '''

    def _is_unset(name):
        # NOTE(review): equality (``==``) rather than identity is kept on
        # purpose; the metadata objects may define their own ``__eq__`` --
        # confirm before changing this to ``is None``.
        return hasattr(container, name) and getattr(container, name) == None  # noqa: E711

    supplied = (
        ('attrs', attrs),
        ('coords', coords),
        ('variables', variables))

    for name, value in supplied:
        if value is None:
            continue

        if not _is_unset(name):
            # Existing metadata (or no such attribute): merge into it.
            getattr(container, name).update(value)
            continue

        if name == 'coords':
            # Coordinates are initialised before being overwritten.
            container.add_coords()

        setattr(container, name, value)


def to_dataset(
        container, attrs=None, coords=None, variables=None, *args, **kwargs):
    '''
    Convert a CSV, Series, or DataFrame to an :py:class:`xarray.Dataset`

    .. note ::

        If a Series is passed, the variable will be named 'data'.
        to_dataset is not implemented for Panel data.

    Parameters
    ----------
    container : object
        A pandas or metacsv Series or DataFrame, or a string or stream
        readable by metacsv.read_csv. xarray DataArray and Dataset objects
        are not converted automatically and raise NotImplementedError.

    attrs : dict
        Container attributes

    coords : dict
        Container coordinates

    variables : dict
        Variable-specific attributes

    *args :
        Additional positional arguments passed to metacsv.read_csv if
        container is a filepath

    **kwargs :
        Additional keyword arguments passed to metacsv.read_csv if
        container is a filepath

    Returns
    -------
    The result of ``metacsv_series_to_dataset`` for one-dimensional data or
    of ``metacsv_dataframe_to_dataset`` for two-dimensional data.

    Raises
    ------
    NotImplementedError
        If the data has more than two dimensions (Panel data)

    Notes
    -----
    The container is not copied. When a metacsv container is passed in,
    ``attrs``, ``coords``, and ``variables`` are applied to that object.

    Example
    -------

    .. code-block:: python

        >>> np.random.seed(1)
        >>>
        >>> to_dataset(
        ...   pd.DataFrame(np.random.random((3,4))),
        ...   attrs={'author': 'my name'})
        ...
        <xarray.Dataset>
        Dimensions:  (index: 3)
        Coordinates:
          * index    (index) int64 0 1 2
        Data variables:
            0        (index) float64 0.417 0.1468 0.3968
            1        (index) float64 0.7203 0.09234 0.5388
            2        (index) float64 0.0001144 0.1863 0.4192
            3        (index) float64 0.3023 0.3456 0.6852
        Attributes:
            author:   my name

    '''

    container = _coerce_to_metacsv(container, *args, **kwargs)
    _parse_args(container, attrs, coords, variables)

    ndim = len(container.shape)

    if ndim == 1:
        return metacsv_series_to_dataset(container)

    elif ndim == 2:
        return metacsv_dataframe_to_dataset(container)

    elif ndim > 2:
        # The message previously named to_dataarray (copy-paste slip).
        raise NotImplementedError(
            'to_dataset not implemented for Panel data')
def to_dataarray(
        container, attrs=None, coords=None, variables=None, *args, **kwargs):
    '''
    Convert a CSV, Series, or DataFrame to an :py:class:`xarray.DataArray`

    .. note ::

        If a DataFrame is passed, columns will be stacked and treated as
        coordinates. to_dataarray is not implemented for Panel data.

    Parameters
    ----------
    container : object
        A pandas or metacsv Series or DataFrame, or a string or stream
        readable by metacsv.read_csv. xarray DataArray and Dataset objects
        are not converted automatically and raise NotImplementedError.

    attrs : dict
        Container attributes

    coords : dict
        Container coordinates

    variables : dict
        Variable-specific attributes

    *args :
        Additional positional arguments passed to metacsv.read_csv if
        container is a filepath

    **kwargs :
        Additional keyword arguments passed to metacsv.read_csv if
        container is a filepath

    Returns
    -------
    The result of ``metacsv_series_to_dataarray`` for one-dimensional data
    or of ``metacsv_dataframe_to_dataarray`` for two-dimensional data.

    Raises
    ------
    NotImplementedError
        If the data has more than two dimensions (Panel data)

    Example
    -------

    .. code-block:: python

        >>> np.random.seed(1)
        >>> to_dataarray(
        ...   pd.DataFrame(np.random.random((3,4)), index=list('ABC')),
        ...   attrs={'author': 'my name'}) # doctest: +SKIP
        ...
        <xarray.DataArray (ind_0: 3, coldim_0: 4)>
        array([[  4.17022005e-01,   7.20324493e-01,   1.14374817e-04,
                  3.02332573e-01],
               [  1.46755891e-01,   9.23385948e-02,   1.86260211e-01,
                  3.45560727e-01],
               [  3.96767474e-01,   5.38816734e-01,   4.19194514e-01,
                  6.85219500e-01]])
        Coordinates:
          * ind_0     (ind_0) object 'A' 'B' 'C'
          * coldim_0  (coldim_0) int64 0 1 2 3
        Attributes:
            author:   my name

    '''

    container = _coerce_to_metacsv(container, *args, **kwargs)
    _parse_args(container, attrs, coords, variables)

    n_dims = len(container.shape)

    if n_dims == 1:
        return metacsv_series_to_dataarray(container)

    if n_dims == 2:
        return metacsv_dataframe_to_dataarray(container)

    if n_dims > 2:
        raise NotImplementedError(
            'to_dataarray not implemented for Panel data')
def to_xarray(
        container, attrs=None, coords=None, variables=None, *args, **kwargs):
    '''
    Convert a Series to an xarray.DataArray and a CSV or DataFrame to an
    xarray.Dataset

    .. note ::

        One-dimensional data is passed to :py:func:`to_dataarray` and
        two-dimensional data to :py:func:`to_dataset`. to_xarray is not
        implemented for Panel data.

    Parameters
    ----------
    container : object
        A pandas or metacsv Series or DataFrame, or a string or stream
        readable by metacsv.read_csv. xarray DataArray and Dataset objects
        are not converted automatically and raise NotImplementedError.

    attrs : dict
        Container attributes

    coords : dict
        Container coordinates

    variables : dict
        Variable-specific attributes

    *args :
        Additional positional arguments passed to metacsv.read_csv if
        container is a filepath

    **kwargs :
        Additional keyword arguments passed to metacsv.read_csv if
        container is a filepath

    Returns
    -------
    The result of ``to_dataarray`` for one-dimensional data or of
    ``to_dataset`` for two-dimensional data.

    Raises
    ------
    NotImplementedError
        If the data has more than two dimensions (Panel data)

    Example
    -------

    .. code-block:: python

        >>> import metacsv
        >>> import numpy as np, pandas as pd
        >>>
        >>> np.random.seed(1)
        >>>
        >>> df = metacsv.DataFrame(
        ...     np.random.random((3,4)),
        ...     columns=['col'+str(i) for i in range(4)])
        >>> df.index = pd.MultiIndex.from_tuples(
        ...     [('a','X'),('b','Y'),('c','Z')],
        ...     names=['abc','xyz'])
        >>> df.attrs={'author': 'my name'}
        >>> df.coords = {'abc': None, 'xyz': ['abc']}
        >>> df # doctest: +NORMALIZE_WHITESPACE
        <metacsv.core.containers.DataFrame (3, 4)>
                     col0      col1      col2      col3
        abc xyz
        a   X    0.417022  0.720324  0.000114  0.302333
        b   Y    0.146756  0.092339  0.186260  0.345561
        c   Z    0.396767  0.538817  0.419195  0.685220
        <BLANKLINE>
        Coordinates
          * abc        (abc) object a, b, c
            xyz        (abc) object X, Y, Z
        Attributes
            author:         my name

        >>> to_xarray(df) # doctest: +SKIP
        <xarray.Dataset>
        Dimensions:  (abc: 3)
        Coordinates:
          * abc      (abc) object 'a' 'b' 'c'
            xyz      (abc) object 'X' 'Y' 'Z'
        Data variables:
            col0     (abc) float64 0.417 0.1468 0.3968
            col1     (abc) float64 0.7203 0.09234 0.5388
            col2     (abc) float64 0.0001144 0.1863 0.4192
            col3     (abc) float64 0.3023 0.3456 0.6852
        Attributes:
            author:   my name

    '''

    container = _coerce_to_metacsv(container, *args, **kwargs)
    _parse_args(container, attrs, coords, variables)

    ndim = len(container.shape)

    # Metadata has already been applied above, so the specific converters
    # are called with the container only.
    if ndim == 1:
        return to_dataarray(container)

    elif ndim == 2:
        return to_dataset(container)

    elif ndim > 2:
        # The message previously named to_dataarray (copy-paste slip).
        raise NotImplementedError(
            'to_xarray not implemented for Panel data')
def to_pandas(container, *args, **kwargs):
    '''
    Convert a metacsv object to a pandas :py:class:`~pandas.Series`,
    :py:class:`~pandas.DataFrame`, or :py:class:`~pandas.Panel`

    Parameters
    ----------
    container : object
        An object exposing ``pandas_parent`` (used as is), or anything
        accepted by the module's coercion helper: a pandas Series,
        DataFrame, or Panel, or a string or stream readable by
        metacsv.read_csv.

    *args :
        Additional positional arguments passed to metacsv.read_csv if
        container is a filepath

    **kwargs :
        Additional keyword arguments passed to metacsv.read_csv if
        container is a filepath

    Returns
    -------
    The result of calling the container's ``pandas_parent`` on the
    container itself.

    Example
    -------

    .. code-block:: python

        >>> import metacsv
        >>> import numpy as np, pandas as pd
        >>>
        >>> np.random.seed(1)
        >>>
        >>> df = metacsv.DataFrame(
        ...     np.random.random((3,4)),
        ...     columns=['col'+str(i) for i in range(4)])
        ...
        >>> df.index = pd.MultiIndex.from_tuples(
        ...     [('a','X'),('b','Y'),('c','Z')], names=['abc','xyz'])
        ...
        >>> df.attrs={'author': 'my name'}
        >>> df.coords = {'abc': None, 'xyz': ['abc']}
        >>> df # doctest: +SKIP
        <metacsv.core.containers.DataFrame (3, 4)>
                     col0      col1      col2      col3
        abc xyz
        a   X    0.328389  0.598790  0.299902  0.265052
        b   Y    0.720712  0.617109  0.331346  0.558522
        c   Z    0.954494  0.143843  0.058968  0.069010

        Coordinates
          * abc        (abc) object a, b, c
            xyz        (abc) object X, Y, Z
        Attributes
            author:         my name

        >>> to_pandas(df) # doctest: +SKIP
                     col0      col1      col2      col3
        abc xyz
        a   X    0.328389  0.598790  0.299902  0.265052
        b   Y    0.720712  0.617109  0.331346  0.558522
        c   Z    0.954494  0.143843  0.058968  0.069010

    '''

    if hasattr(container, 'pandas_parent'):
        # Already knows its pandas base type; no coercion required.
        source = container
    else:
        source = _coerce_to_metacsv(container, *args, **kwargs)

    return source.pandas_parent(source)
def to_netcdf(
        container, fp, attrs=None, coords=None, variables=None,
        *args, **kwargs):
    '''
    Convert a CSV, Series, or DataFrame to a NetCDF file

    .. note ::

        The data is converted with :py:func:`to_dataset` and written with
        the resulting object's ``to_netcdf`` method. to_dataset is not
        implemented for Panel data.

    Parameters
    ----------
    container : object
        A pandas or metacsv Series or DataFrame, or a string or stream
        readable by metacsv.read_csv. xarray DataArray and Dataset objects
        are not converted automatically and raise NotImplementedError.

    fp : str
        Path passed to the dataset's ``to_netcdf`` method

    attrs : dict
        Container attributes

    coords : dict
        Container coordinates

    variables : dict
        Variable-specific attributes

    *args :
        Additional positional arguments passed to metacsv.read_csv if
        container is a filepath

    **kwargs :
        Additional keyword arguments passed to metacsv.read_csv if
        container is a filepath

    Example
    -------

    .. code-block:: python

        >>> np.random.seed(1)
        >>>
        >>> to_netcdf(
        ...   pd.DataFrame(np.random.random((3,4)), columns=list('ABCD')),
        ...   'test.nc',
        ...   attrs={'author': 'my name'})
        ...
        >>> xr.open_dataset('test.nc')
        <xarray.Dataset>
        Dimensions:  (index: 3)
        Coordinates:
          * index    (index) int64 0 1 2
        Data variables:
            A        (index) float64 0.417 0.1468 0.3968
            B        (index) float64 0.7203 0.09234 0.5388
            C        (index) float64 0.0001144 0.1863 0.4192
            D        (index) float64 0.3023 0.3456 0.6852
        Attributes:
            author:   my name

    '''

    # attrs/coords/variables are passed positionally on purpose. Mixing
    # ``attrs=attrs`` with ``*args`` would bind the first extra positional
    # argument to ``attrs`` too and raise "got multiple values for argument".
    dataset = to_dataset(
        container, attrs, coords, variables, *args, **kwargs)

    dataset.to_netcdf(fp)
def to_csv(
        container, fp, attrs=None, coords=None, variables=None,
        header_file=None, *args, **kwargs):
    r'''
    Write a CSV, Series, or DataFrame to a metacsv-formatted csv

    .. note ::

        A copy of the container is written; ``attrs``, ``coords``, and
        ``variables`` are applied to the copy.

    Parameters
    ----------
    container : object
        A pandas or metacsv Series, DataFrame, or Panel, or a string or
        stream readable by metacsv.read_csv. xarray DataArray and Dataset
        objects are not converted automatically and raise
        NotImplementedError.

    fp : str
        Path to which to write the metacsv-formatted CSV

    attrs : dict
        Container attributes

    coords : dict
        Container coordinates

    variables : dict
        Variable-specific attributes

    header_file : str_or_buffer
        A separate metacsv-formatted header file. Forwarded to
        metacsv.read_csv when container is a filepath or stream.

    *args :
        Additional positional arguments passed to the metacsv csv writer

    **kwargs :
        Additional keyword arguments passed to the metacsv csv writer

    Example
    -------

    .. code-block:: python

        >>> np.random.seed(1)
        >>> index = pd.MultiIndex.from_tuples(
        ...     [('X', 1), ('X', 2), ('Y', 1)],
        ...     names=['alpha', 'beta'])
        ...
        >>> df = pd.DataFrame(
        ...     np.random.random((3,4)),
        ...     index=index,
        ...     columns=list('ABCD'))
        ...
        >>> to_csv(
        ...     df,
        ...     fp='my-metacsv-data.csv',
        ...     attrs={'author': 'my name'},
        ...     coords=['alpha', 'beta'])
        ...

    This metacsv-formatted CSV can be then used by metacsv or converted
    using any of the converters in this module:

    .. code-block:: python

        >>> to_xarray('my-metacsv-data.csv')
        <xarray.Dataset>
        Dimensions:  (alpha: 2, beta: 2)
        Coordinates:
          * alpha    (alpha) object 'X' 'Y'
          * beta     (beta) int64 1 2
        Data variables:
            A        (alpha, beta) float64 0.417 0.1468 0.3968 nan
            B        (alpha, beta) float64 0.7203 0.09234 0.5388 nan
            C        (alpha, beta) float64 0.0001144 0.1863 0.4192 nan
            D        (alpha, beta) float64 0.3023 0.3456 0.6852 nan
        Attributes:
            author:   my name

    '''

    coerced = _coerce_to_metacsv(container, header_file=header_file)

    # Work on a copy so the metadata updates do not touch the caller's data.
    working_copy = coerced.copy()
    _parse_args(working_copy, attrs, coords, variables)

    metacsv_to_csv(working_copy, fp, *args, **kwargs)
def to_header(
        fp, container=None, attrs=None, coords=None, variables=None,
        *args, **kwargs):
    '''
    Write metacsv attributes directly to a metacsv-formatted header file

    Parameters
    ----------
    fp : str
        Path to which to write the metacsv-formatted header file

    container : object
        A metacsv Series, DataFrame, or Panel, or a metacsv-formatted csv
        file from which to derive attrs, coords, and variables (optional)

    attrs : dict
        Attributes to write to header file (optional). If container is also
        supplied, these attrs will update the attrs dict on a copy of the
        provided container.

    coords : dict
        Coordinates to write to header file (optional). If container is
        also supplied, these coords will update the coords dict on a copy
        of the provided container.

    variables : dict
        Variable metadata to write to header file (optional). If container
        is also supplied, these variable metadata will update the variables
        dict on a copy of the provided container.

    *args :
        Additional positional arguments passed to metacsv.read_csv if
        container is a filepath

    **kwargs :
        Additional keyword arguments passed to metacsv.read_csv if
        container is a filepath

    Example
    -------

    .. code-block:: python

        >>> to_header('mycsv.header', attrs={'author': 'me'}, coords='index')

    '''

    if container is None:
        # No container: wrap the raw arguments in the metadata classes
        # unless they already are instances of them.
        header_attrs = (
            attrs if isinstance(attrs, Attributes) else Attributes(attrs))
        header_coords = (
            coords if isinstance(coords, Coordinates)
            else Coordinates(coords))
        header_variables = (
            variables if isinstance(variables, Variables)
            else Variables(variables))

    else:
        # Merge the supplied metadata into a copy of the container and
        # write whatever the copy ends up carrying.
        working_copy = _coerce_to_metacsv(container, *args, **kwargs).copy()
        _parse_args(working_copy, attrs, coords, variables)

        header_attrs = working_copy.attrs
        header_coords = working_copy.coords
        header_variables = working_copy.variables

    metacsv_to_header(
        fp,
        attrs=header_attrs,
        coords=header_coords,
        variables=header_variables)