# Source code for metacsv.io.parsers


from __future__ import absolute_import, division, print_function, \
    with_statement, unicode_literals

import pandas as pd
import re
from collections import OrderedDict
from .yaml_tools import ordered_load
from .._compat import string_types, has_iteritems, iteritems
from ..core.internals import Container, Attributes, Variables, Coordinates
from ..core.containers import Series, DataFrame, Panel


def find_yaml_start(line):
    """
    Test whether a line is a YAML document-start marker

    Args:
        line (str): a single line of text, with or without its line ending

    Returns:
        bool: True if the line consists solely of three or more hyphens
        (``---``), optionally surrounded by whitespace
    """
    return re.search(r'^\s*-{3,}\s*$', line) is not None


def find_yaml_stop(line):
    """
    Test whether a line is a YAML document-end marker

    Args:
        line (str): a single line of text, with or without its line ending

    Returns:
        bool: True if the line consists solely of three or more periods
        (``...``), optionally surrounded by whitespace
    """
    return re.search(r'^\s*\.{3,}\s*$', line) is not None


def _parse_headered_data(fp):
    """
    Consume a leading YAML header from an open buffer, if one is present

    Blank lines at the top of the buffer are skipped. If the first non-blank
    line is a yaml start marker (``---``), every following line up to the
    yaml stop marker (``...``) is parsed as the header, and the buffer is
    left positioned just after the stop marker. Otherwise the buffer is
    rewound to where it started so that it can be read like a normal CSV.

    Args:
        fp (buffer): open, seekable text buffer that yields lines

    Returns:
        OrderedDict: the parsed header; empty when the buffer has no header
        (including an empty or all-blank buffer)

    Raises:
        ValueError: if a header is opened with ``---`` but the buffer ends
            before the closing ``...`` marker
    """

    # Remember the starting position so the buffer can be rewound if the
    # file turns out not to begin with a yaml header
    loc = fp.tell()

    # Skip blank lines ahead of the (possible) start marker. ``None`` signals
    # that the buffer ran out before any content was found.
    nextline = ''
    while nextline is not None and re.search(r'^[\s\n\r]*$', nextline):
        nextline = next(fp, None)

    if nextline is None or not find_yaml_start(nextline):
        fp.seek(loc)
        return OrderedDict()

    # Collect the header body line by line until the stop marker
    yaml_lines = []
    while True:
        this_line = next(fp, None)

        if this_line is None:
            raise ValueError(
                'Unterminated yaml header: reached end of file before the '
                'closing "..." marker')

        if find_yaml_stop(this_line):
            break

        yaml_lines.append(this_line.rstrip('\n'))

    header = ordered_load('\n'.join(yaml_lines))

    # An empty header block presumably loads as None (as yaml loaders
    # typically do for an empty document); normalise so callers can always
    # treat the result as a mapping
    if header is None:
        return OrderedDict()

    return header

def _verify_deep_assertion(verify_par, par):
    """
    Recursively check a value against an assertion specification

    Args:
        verify_par: the assertion. If it has items to iterate (as judged by
            ``has_iteritems``), each of its values is checked against the
            entry of ``par`` with the same key. Otherwise, a callable is
            invoked with ``par`` and must return a truthy value, and any
            other object must compare equal to ``par``.
        par: the value being checked

    Raises:
        ValueError: if ``par`` is None, or if ``verify_par`` is a mapping
            but ``par`` is not
        AssertionError: if a callable assertion returns a falsy value or an
            equality assertion does not hold
    """
    if par is None:
        raise ValueError('Assertions failed')

    if not has_iteritems(verify_par):
        # Leaf assertion. Raise explicitly rather than using ``assert`` so
        # the check is not stripped when python runs with -O.
        if callable(verify_par):
            if not verify_par(par):
                raise AssertionError(
                    'Assertion failed: {!r} rejected value {!r}'.format(
                        verify_par, par))
        elif not (verify_par == par):
            raise AssertionError(
                'Assertion failed: expected {!r}, found {!r}'.format(
                    verify_par, par))
        return

    if not has_iteritems(par):
        raise ValueError('Assertions failed')

    for kw, arg in iteritems(verify_par):
        _verify_deep_assertion(arg, par[kw])


def _verify_assertions(assertions=None, attrs=None, coords=None, variables=None):
    """
    Check header contents against a set of user-supplied assertions

    The keys ``'attrs'``, ``'coords'`` and ``'variables'`` in ``assertions``
    are checked against the matching argument. Every other key is checked
    against the entry of ``attrs`` with the same name.

    Args:
        assertions (dict-like): assertions to verify; None skips all checks
        attrs: attributes to check
        coords: coordinates to check
        variables: variables to check

    Raises:
        TypeError: if ``assertions`` is given but has no items to iterate
    """
    if assertions is None:
        return

    if not has_iteritems(assertions):
        raise TypeError('assertions must be iterable')

    # Named sections, verified in this fixed order before any other key
    sections = [('attrs', attrs), ('coords', coords), ('variables', variables)]

    for section_name, section_value in sections:
        if section_name in assertions:
            _verify_deep_assertion(assertions[section_name], section_value)

    # Anything that is not a named section is an assertion on an attribute
    section_names = [section_name for section_name, _ in sections]
    for key, expected in iteritems(assertions):
        if key not in section_names:
            _verify_deep_assertion(expected, attrs[key])



def read_header(fp, header_file=None, parse_vars=False, assertions=None, *args, **kwargs):
    """
    Read a metacsv-formatted header

    Args:
        fp (str or buffer): csv or metacsv-formatted filepath or buffer to read

    Kwargs:
        header_file (str or buffer): optional supplemental yaml header file
        parse_vars (bool): parse compact-style variable definitions (see example)
        assertions (dict-like): dictionary of values to assert in file header

    Returns:
        tuple: ``(attrs, coords, variables)``, in that order

    Values found in the header of ``fp`` take precedence over values with the
    same key in ``header_file``.

    Example:

        >>> import metacsv
        >>> import StringIO as io # import io for python 3
        >>> doc = io.StringIO('''
        ... ---
        ... author: A Person
        ... date:   2000-01-01
        ... variables:
        ...     pop:
        ...       name: Population
        ...       unit: millions
        ...     gdp:
        ...       name: Product
        ...       unit: 2005 $Bn
        ... ...
        ... other data, not csv-formatted
        ... ''')

        >>> attrs, coords, variables = metacsv.read_header(doc, index_col=[0,1])
        >>> variables # doctest: +NORMALIZE_WHITESPACE
        Variables
            gdp:
                name            Product
                unit            2005 $Bn
            pop:
                name            Population
                unit            millions

        >>> attrs # doctest: +NORMALIZE_WHITESPACE
        Attributes
            author:         A Person
            date:           2000-01-01

        >>> coords
        <Empty Coordinates>

    **parse_vars**

    The read_header argument ``parse_vars`` allows parsing of one-line variable definitions in the format ``var: description [unit]``:

    Example:

        >>> doc = io.StringIO('''
        ... ---
        ... author: A Person
        ... date:   2000-01-01
        ... variables:
        ...     pop: Population [millions]
        ...     gdp: Product [2005 $Bn]
        ... ...
        ... region,year,pop,gdp
        ... USA,2010,309.3,13599.3
        ... USA,2011,311.7,13817.0
        ... CAN,2010,34.0,1240.0
        ... CAN,2011,34.3,1276.7
        ... ''')

        >>> attrs, coords, variables = metacsv.read_header(doc, parse_vars=True)
        >>> variables # doctest: +NORMALIZE_WHITESPACE
        Variables
            gdp:
                description     Product
                unit            2005 $Bn
            pop:
                description     Population
                unit            millions
    """

    kwargs = dict(kwargs)

    header = OrderedDict()

    # Load the supplemental header, given either as a path or an open buffer
    if isinstance(header_file, string_types):
        with open(header_file, 'r') as hf:
            header = ordered_load(hf.read())

    elif header_file is not None:
        header = ordered_load(header_file.read())

    # An empty header file presumably loads as None; keep ``header`` a
    # mapping so it can be updated below
    if header is None:
        header = OrderedDict()

    # Read the header embedded at the top of the data file itself
    if isinstance(fp, string_types):
        with open(fp, 'r') as data_file:
            _header = _parse_headered_data(data_file)

    else:
        _header = _parse_headered_data(fp)

    # Values embedded in the data file override the supplemental header
    header.update(_header)

    # Separate the special sections (attrs, coords, variables) from the
    # remaining arguments
    kwargs.update({'attrs': header})
    args, kwargs, special = Container.strip_special_attributes(args, kwargs)

    if parse_vars and 'variables' in special:
        # Existing keys are reassigned in place; no keys are added or removed
        for key, var in special['variables'].items():
            special['variables'][key] = Variables.parse_string_var(var)

    attrs = Attributes(special.get('attrs'))
    coords = Coordinates(special.get('coords'))
    variables = Variables(special.get('variables'))

    _verify_assertions(assertions, attrs=attrs, coords=coords, variables=variables)

    return attrs, coords, variables


def read_csv(fp, header_file=None, parse_vars=False, assertions=None, *args, **kwargs):
    """
    Read a csv or metacsv-formatted csv into a metacsv.DataFrame

    Args:
        fp (str or buffer): csv or metacsv-formatted filepath or buffer to read

    Kwargs:
        header_file (str or buffer): optional supplemental yaml header file
        parse_vars (bool): parse compact-style variable definitions (see example)
        assertions (dict-like): dictionary of values to assert in file header

    *args, **kwargs passed to pandas.read_csv

    Returns:
        a metacsv DataFrame, or a metacsv Series when ``squeeze=True`` is
        passed and the data has a single column

    Values found in the header of ``fp`` take precedence over values with the
    same key in ``header_file``. The pandas ``engine`` defaults to
    ``'python'`` unless given explicitly.

    Example:

        >>> import metacsv, numpy as np
        >>> import StringIO as io # import io for python 3
        >>> doc = io.StringIO('''
        ... ---
        ... author: A Person
        ... date:   2000-01-01
        ... variables:
        ...     pop:
        ...       name: Population
        ...       unit: millions
        ...     gdp:
        ...       name: Product
        ...       unit: 2005 $Bn
        ... ...
        ... region,year,pop,gdp
        ... USA,2010,309.3,13599.3
        ... USA,2011,311.7,13817.0
        ... CAN,2010,34.0,1240.0
        ... CAN,2011,34.3,1276.7
        ... ''')

        >>> df = metacsv.read_csv(doc, index_col=[0,1])
        >>> df # doctest: +NORMALIZE_WHITESPACE
        <metacsv.core.containers.DataFrame (4, 2)>
                       pop      gdp
        region year
        USA    2010  309.3  13599.3
               2011  311.7  13817.0
        CAN    2010   34.0   1240.0
               2011   34.3   1276.7
        <BLANKLINE>
        Variables
            gdp:
                name            Product
                unit            2005 $Bn
            pop:
                name            Population
                unit            millions
        Attributes
            author:         A Person
            date:           2000-01-01

    **parse_vars**

    The read-csv argument ``parse_vars`` allows parsing of one-line variable definitions in the format ``var: description [unit]``:

    Example:

        >>> doc = io.StringIO('''
        ... ---
        ... author: A Person
        ... date:   2000-01-01
        ... variables:
        ...     pop: Population [millions]
        ...     gdp: Product [2005 $Bn]
        ... ...
        ... region,year,pop,gdp
        ... USA,2010,309.3,13599.3
        ... USA,2011,311.7,13817.0
        ... CAN,2010,34.0,1240.0
        ... CAN,2011,34.3,1276.7
        ... ''')

        >>> metacsv.read_csv(doc, index_col=0, parse_vars=True) # doctest: +NORMALIZE_WHITESPACE
        <metacsv.core.containers.DataFrame (4, 3)>
                year    pop      gdp
        region
        USA     2010  309.3  13599.3
        USA     2011  311.7  13817.0
        CAN     2010   34.0   1240.0
        CAN     2011   34.3   1276.7
        <BLANKLINE>
        Variables
            gdp:
                description     Product
                unit            2005 $Bn
            pop:
                description     Population
                unit            millions
        Attributes
            author:         A Person
            date:           2000-01-01
    """

    kwargs = dict(kwargs)

    # ``squeeze`` stays in kwargs so that it is also forwarded to pandas
    squeeze = kwargs.get('squeeze', False)

    # set defaults
    kwargs.setdefault('engine', 'python')

    header = OrderedDict()

    # Load the supplemental header, given either as a path or an open buffer
    if isinstance(header_file, string_types):
        with open(header_file, 'r') as hf:
            header = ordered_load(hf.read())

    elif header_file is not None:
        header = ordered_load(header_file.read())

    # An empty header file presumably loads as None; keep ``header`` a
    # mapping so it can be updated below
    if header is None:
        header = OrderedDict()

    # Strip the embedded header first; pandas then reads the remainder of
    # the same buffer as a regular csv
    if isinstance(fp, string_types):
        with open(fp, 'r') as data_file:
            _header = _parse_headered_data(data_file)
            data = pd.read_csv(data_file, *args, **kwargs)

    else:
        _header = _parse_headered_data(fp)
        data = pd.read_csv(fp, *args, **kwargs)

    # Values embedded in the data file override the supplemental header
    header.update(_header)

    # Separate the special sections (attrs, coords, variables) from the
    # remaining arguments
    kwargs.update({'attrs': header})
    args, kwargs, special = Container.strip_special_attributes(args, kwargs)

    if parse_vars and 'variables' in special:
        # Existing keys are reassigned in place; no keys are added or removed
        for key, var in special['variables'].items():
            special['variables'][key] = Variables.parse_string_var(var)

    # pandas has already squeezed the data down to one dimension
    if squeeze and len(data.shape) == 1:
        s = Series(data, **special)
        _verify_assertions(assertions, attrs=s.attrs, variables=s.variables, coords=s.coords)
        return s

    df = DataFrame(data, **special)

    # A single remaining column (e.g. after index handling) is squeezed here
    if squeeze and df.shape[1] == 1:
        s = Series(df[df.columns[0]], **special)
        _verify_assertions(assertions, attrs=s.attrs, variables=s.variables, coords=s.coords)
        return s

    _verify_assertions(assertions, attrs=df.attrs, variables=df.variables, coords=df.coords)
    return df


def read_pickle(fp, assertions=None, *args, **kwargs):
    """
    Read a pandas or metacsv pickle file

    Args:
        fp (str or buffer): filepath or buffer to read

    Kwargs:
        assertions (dict-like): dictionary of values to assert in file header

    *args, **kwargs passed to pandas.read_pickle

    Returns:
        the unpickled object, exactly as returned by pandas.read_pickle

    Assertions are checked against the ``attrs``, ``coords`` and
    ``variables`` attributes of the loaded object; any of these that the
    object does not have is treated as missing.
    """

    # NOTE(security): unpickling can execute arbitrary code. Only read
    # pickle files that come from a trusted source.
    data = pd.read_pickle(fp, *args, **kwargs)

    _verify_assertions(
        assertions,
        attrs=getattr(data, 'attrs', None),
        coords=getattr(data, 'coords', None),
        variables=getattr(data, 'variables', None))

    return data