# biome.text.data.datasource Module

# DataSource Class


class DataSource (
    source: Union[str, List[str], NoneType] = None,
    mapping: Union[Dict[str, Union[List[str], str]], NoneType] = None,
    format: Union[str, NoneType] = None,
    reindex_with: Union[str, NoneType] = 'id',
    **reader_options,
)

This class takes care of reading the data source, usually specified in a yaml file.

It uses the source readers to extract a dask DataFrame.

Parameters

source
The data source. Can be a filesystem path, a list of filesystem paths, or a key name indicating the source backend (e.g. elasticsearch).
format
The data format. Optional. If provided, it overrides the format extracted from the source. Supported formats are listed as keys in the SUPPORTED_FORMATS dict of this class.
mapping
Used to map the features (columns) of the data source to the parameters of the DataSourceReader's text_to_instance method.
reindex_with : string
If provided, tries to reindex the data using the given column.
**reader_options
Additional kwargs are passed on to the source readers that depend on the format (see the biome.text.data.readers module).

# add_supported_format Static method


def add_supported_format (
  format_key: str,
  parser: Callable,
  default_params: Dict[str, Any] = None,
)  -> NoneType

Add a new format and reader to the data source readers.

Parameters

format_key
The new format key
parser
The parser function
default_params
Default parameters for the parser function

# from_yaml Static method


def from_yaml(file_path: str) -> DataSource

Create a data source from a yaml file.

For the specific format, see the self.to_yaml() method.

Parameters

file_path
The path to the yaml file.

Returns

cls
The DataSource created from the yaml file.

# to_dataframe Method


def to_dataframe(self) -> dask.dataframe.core.DataFrame

Returns the underlying DataFrame of the data source

# to_mapped_dataframe Method


def to_mapped_dataframe(self) -> dask.dataframe.core.DataFrame

Returns a DataFrame with the data source's mapping applied. The columns of this DataFrame are named after the mapping keys, which in turn should match the parameter names in the DatasetReader's text_to_instance method. The content of these columns is specified in the mapping dictionary.

Returns

mapped_dataframe
Contains columns corresponding to the parameter names of the DatasetReader's text_to_instance method.

# to_yaml Method


def to_yaml (
  self,
  path: str,
  make_source_path_absolute: bool = False,
)  -> str

Create a yaml config file for this data source.

Parameters

path
Path to the yaml file to be written.
make_source_path_absolute
If true, writes the source of the DataSource as an absolute path.

Returns

path
Path of the yaml file that was written.


# head Method


def head (
  self,
  n: int = 10,
)  -> 'pandas.DataFrame'

Allows for a peek into the data source showing the first n rows.

Parameters

n
Number of rows to show.

Returns

df
The first n rows as a pandas.DataFrame.
Maintained by