1558 lines
54 KiB
Python
1558 lines
54 KiB
Python
from __future__ import annotations
|
|
|
|
from collections import (
|
|
abc,
|
|
defaultdict,
|
|
)
|
|
import csv
|
|
from io import StringIO
|
|
import re
|
|
from typing import (
|
|
IO,
|
|
TYPE_CHECKING,
|
|
Any,
|
|
DefaultDict,
|
|
Literal,
|
|
cast,
|
|
final,
|
|
)
|
|
import warnings
|
|
|
|
import numpy as np
|
|
|
|
from pandas._libs import lib
|
|
from pandas._typing import Scalar
|
|
from pandas.errors import (
|
|
EmptyDataError,
|
|
ParserError,
|
|
ParserWarning,
|
|
)
|
|
from pandas.util._decorators import cache_readonly
|
|
from pandas.util._exceptions import find_stack_level
|
|
|
|
from pandas.core.dtypes.astype import astype_array
|
|
from pandas.core.dtypes.common import (
|
|
is_bool_dtype,
|
|
is_extension_array_dtype,
|
|
is_integer,
|
|
is_numeric_dtype,
|
|
is_object_dtype,
|
|
is_string_dtype,
|
|
pandas_dtype,
|
|
)
|
|
from pandas.core.dtypes.dtypes import (
|
|
CategoricalDtype,
|
|
ExtensionDtype,
|
|
)
|
|
from pandas.core.dtypes.inference import is_dict_like
|
|
|
|
from pandas.core import algorithms
|
|
from pandas.core.arrays import (
|
|
Categorical,
|
|
ExtensionArray,
|
|
)
|
|
from pandas.core.arrays.boolean import BooleanDtype
|
|
from pandas.core.indexes.api import Index
|
|
|
|
from pandas.io.common import (
|
|
dedup_names,
|
|
is_potential_multi_index,
|
|
)
|
|
from pandas.io.parsers.base_parser import (
|
|
ParserBase,
|
|
evaluate_callable_usecols,
|
|
get_na_values,
|
|
parser_defaults,
|
|
validate_parse_dates_presence,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import (
|
|
Hashable,
|
|
Iterator,
|
|
Mapping,
|
|
Sequence,
|
|
)
|
|
|
|
from pandas._typing import (
|
|
ArrayLike,
|
|
DtypeObj,
|
|
ReadCsvBuffer,
|
|
T,
|
|
)
|
|
|
|
from pandas import (
|
|
MultiIndex,
|
|
Series,
|
|
)
|
|
|
|
# BOM character (byte order mark), U+FEFF.
# It can appear at the very beginning of a file (stream) to indicate its
# endianness. It is not data: left in place it ends up inside the first field
# of the first row, so ``PythonParser._check_for_bom`` removes it when seen.
_BOM = "\ufeff"
|
|
|
|
|
|
class PythonParser(ParserBase):
|
|
_no_thousands_columns: set[int]
|
|
|
|
def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
    """
    Workhorse function for processing nested list into DataFrame.

    Stores the parsing options, wraps the input in a row source and works
    out the column layout.

    Parameters
    ----------
    f : object with a ``readline`` method, or list
        A list is used directly as the row data (the ``read_excel`` path;
        its rows may hold non-string values). Anything else is handed to
        ``_make_reader``.
    **kwds
        Parser options, forwarded unchanged to the base-class initialiser.
        Every key indexed below (``skiprows``, ``skipfooter``,
        ``delimiter``, ``quotechar``, ...) must be present; only
        ``has_index_names`` is optional and defaults to False.

    Raises
    ------
    ValueError
        If ``decimal`` is not exactly one character long. Note that this
        check runs last, after the column inference has already read from
        the input.
    """
    super().__init__(kwds)

    self.data: Iterator[list[str]] | list[list[Scalar]] = []
    # Rows fetched by ``_next_line`` are appended here; ``_buffered_line``
    # serves the first entry again without touching the source.
    self.buf: list = []
    # ``pos`` advances for every raw row taken from the source (skipped
    # ones included); ``line_pos`` advances once per row actually returned.
    self.pos = 0
    self.line_pos = 0

    self.skiprows = kwds["skiprows"]

    # Normalise ``skiprows`` into a predicate on the raw row number.
    if callable(self.skiprows):
        self.skipfunc = self.skiprows
    else:
        self.skipfunc = lambda x: x in self.skiprows

    self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
    self.delimiter = kwds["delimiter"]

    self.quotechar = kwds["quotechar"]
    if isinstance(self.quotechar, str):
        # ``str()`` turns an instance of a str subclass into a plain str.
        self.quotechar = str(self.quotechar)

    self.escapechar = kwds["escapechar"]
    self.doublequote = kwds["doublequote"]
    self.skipinitialspace = kwds["skipinitialspace"]
    self.lineterminator = kwds["lineterminator"]
    self.quoting = kwds["quoting"]
    self.skip_blank_lines = kwds["skip_blank_lines"]

    # Passed from read_excel
    self.has_index_names = kwds.get("has_index_names", False)

    self.thousands = kwds["thousands"]
    self.decimal = kwds["decimal"]

    self.comment = kwds["comment"]

    # Set self.data to something that can read lines.
    if isinstance(f, list):
        # read_excel: f is a nested list, can contain non-str
        self.data = f
    else:
        assert hasattr(f, "readline")
        # yields list of str
        self.data = self._make_reader(f)

    # Get columns in two steps: infer from data, then
    # infer column indices from self.usecols if it is specified.
    # ``_infer_columns`` may set ``self._col_indices`` through
    # ``_handle_usecols``, so it has to exist beforehand.
    self._col_indices: list[int] | None = None
    columns: list[list[Scalar | None]]
    (
        columns,
        self.num_original_columns,
        self.unnamed_cols,
    ) = self._infer_columns()

    # Now self.columns has the set of columns that we will process.
    # The original set is stored in self.orig_names.
    # error: Cannot determine type of 'index_names'
    (
        self.columns,
        self.index_names,
        self.col_names,
        _,
    ) = self._extract_multi_indexer_columns(
        columns,
        self.index_names,
    )

    # get popped off for index
    self.orig_names: list[Hashable] = list(self.columns)

    index_names, self.orig_names, self.columns = self._get_index_name()
    # Index names that are already set take precedence over inferred ones.
    if self.index_names is None:
        self.index_names = index_names

    # Without a usecols selection every column is used.
    if self._col_indices is None:
        self._col_indices = list(range(len(self.columns)))

    self._no_thousands_columns = self._set_no_thousand_columns()

    if len(self.decimal) != 1:
        raise ValueError("Only length-1 decimal markers supported")
|
|
|
|
@cache_readonly
def num(self) -> re.Pattern:
    """
    Compiled pattern recognising a numeric literal.

    The pattern is anchored at both ends and is assembled from an optional
    sign, an integer part (digit groups closed by ``self.thousands`` when
    that is set), an optional fraction introduced by ``self.decimal`` and an
    optional exponent. Both separators are regex-escaped.
    """
    dec_sep = re.escape(self.decimal)

    if self.thousands is None:
        integer_part = r"[0-9]*"
    else:
        group_sep = re.escape(self.thousands)
        integer_part = rf"([0-9]+{group_sep}|[0-9])*"

    pattern = (
        r"^[\-\+]?"
        + integer_part
        + rf"({dec_sep}[0-9]*)?"
        + r"([0-9]?(E|e)\-?[0-9]+)?$"
    )
    return re.compile(pattern)
|
|
|
|
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> Iterator[list[str]]:
    """
    Build the row iterator for a file-like input.

    A delimiter longer than one character is used as a regular expression:
    every line is stripped and split with it. Otherwise the standard
    ``csv`` reader is used with a dialect built from the quoting options on
    ``self``. When no delimiter was given it is sniffed from the first line
    that is neither skipped nor emptied by comment removal; that line is
    parsed immediately and stored in ``self.buf``.

    Parameters
    ----------
    f : file-like
        Text source offering ``readline`` and line iteration.

    Returns
    -------
    iterator of list of str

    Raises
    ------
    ValueError
        If ``self.lineterminator`` is set while the ``csv`` reader is used.
    """
    sep = self.delimiter

    if sep is not None and len(sep) != 1:

        def _regex_rows():
            head = f.readline()
            splitter = re.compile(sep)

            yield splitter.split(head.strip())

            for raw in f:
                yield splitter.split(raw.strip())

        return _regex_rows()

    if self.lineterminator:
        raise ValueError(
            "Custom line terminators not supported in python parser (yet)"
        )

    class MyDialect(csv.Dialect):
        delimiter = self.delimiter
        quotechar = self.quotechar
        escapechar = self.escapechar
        doublequote = self.doublequote
        skipinitialspace = self.skipinitialspace
        quoting = self.quoting
        lineterminator = "\n"

    dia = MyDialect

    if sep is None:
        # attempt to sniff the delimiter from the first valid line,
        # i.e. no comment line and not in skiprows
        while True:
            line = f.readline()
            lines = self._check_comments([[line]])[0]
            if not (self.skipfunc(self.pos) or not lines):
                break
            self.pos += 1

        # since `line` was a string, lines will be a list containing
        # only a single string
        line = cast(list[str], lines)[0]

        self.pos += 1
        self.line_pos += 1
        dia.delimiter = csv.Sniffer().sniff(line).delimiter

        # Note: encoding is irrelevant here
        sniffed_rows = list(csv.reader(StringIO(line), dialect=dia))
        self.buf.extend(sniffed_rows)
    else:
        dia.delimiter = sep
        # Skip rows at file level before csv.reader sees them
        # prevents CSV parsing errors on lines that will be discarded
        if self.skiprows is not None:
            while self.skipfunc(self.pos):
                if not f.readline():
                    break
                self.pos += 1

    # Note: encoding is irrelevant here
    return csv.reader(f, dialect=dia, strict=True)
|
|
|
|
def read(
    self, rows: int | None = None
) -> tuple[
    Index | None,
    Sequence[Hashable] | MultiIndex,
    Mapping[Hashable, ArrayLike | Series],
]:
    """
    Parse up to ``rows`` rows and return them column-wise.

    Parameters
    ----------
    rows : int, optional
        Passed on to ``_get_lines``.

    Returns
    -------
    tuple
        ``(index, columns, data)``. When no rows were obtained the result
        comes from ``_get_empty_meta`` instead.

    Raises
    ------
    StopIteration
        If the source is exhausted on any call after the first one; the
        parser is closed before the exception is re-raised.
    """
    try:
        content = self._get_lines(rows)
    except StopIteration:
        if not self._first_chunk:
            self.close()
            raise
        content = []

    # done with first read, next time raise StopIteration
    self._first_chunk = False

    index: Index | None
    columns: Sequence[Hashable] = list(self.orig_names)
    if not content:  # pragma: no cover
        # DataFrame with the right metadata, even though it's length 0
        # error: Cannot determine type of 'index_col'
        is_multi = is_potential_multi_index(self.orig_names, self.index_col)
        names = dedup_names(self.orig_names, is_multi)
        index, columns, col_dict = self._get_empty_meta(names, self.dtype)
        conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names)
        return index, conv_columns, col_dict

    # handle new style for names in index
    indexnamerow = None
    if self.has_index_names:
        blank_cells = sum(int(v == "" or v is None) for v in content[0])
        if blank_cells == len(columns):
            # The leading row carries index names only; drop it from the data.
            indexnamerow = content[0]
            content = content[1:]

    alldata = self._rows_to_cols(content)
    data, columns = self._exclude_implicit_index(alldata)

    converted = self._convert_data(data)
    converted = self._do_date_conversions(columns, converted)

    index, result_columns = self._make_index(alldata, columns, indexnamerow)

    return index, result_columns, converted
|
|
|
|
def _exclude_implicit_index(
    self,
    alldata: list[np.ndarray],
) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
    """
    Pair the de-duplicated column names with their data arrays.

    When ``self._implicit_index`` is set, the first ``len(self.index_col)``
    arrays are skipped over when pairing.

    Parameters
    ----------
    alldata : list of np.ndarray
        One array per parsed column.

    Returns
    -------
    tuple
        ``(mapping of name -> array, names)``; names whose position is not
        below ``len(alldata)`` are left out of the mapping.
    """
    # error: Cannot determine type of 'index_col'
    is_multi = is_potential_multi_index(self.orig_names, self.index_col)
    names = dedup_names(self.orig_names, is_multi)

    offset = len(self.index_col) if self._implicit_index else 0

    n_arrays = len(alldata)
    self._check_data_length(names, alldata)

    by_name = {}
    for pos, name in enumerate(names):
        if pos < n_arrays:
            by_name[name] = alldata[pos + offset]
    return by_name, names
|
|
|
|
# legacy
|
|
def get_chunk(
    self, size: int | None = None
) -> tuple[
    Index | None,
    Sequence[Hashable] | MultiIndex,
    Mapping[Hashable, ArrayLike | Series],
]:
    """
    Read the next chunk; thin wrapper around ``read``.

    Parameters
    ----------
    size : int, optional
        Number of rows to request. ``self.chunksize`` is used when omitted.

    Returns
    -------
    tuple
        Whatever ``read(rows=...)`` returns.
    """
    # error: "PythonParser" has no attribute "chunksize"
    rows = self.chunksize if size is None else size  # type: ignore[attr-defined]
    return self.read(rows=rows)
|
|
|
|
def _convert_data(
    self,
    data: Mapping[Hashable, np.ndarray],
) -> Mapping[Hashable, ArrayLike]:
    """
    Prepare per-column options and delegate to ``_convert_to_ndarrays``.

    Converters and dtypes go through ``_clean_mapping``. Dict-like
    ``na_values`` / ``na_fvalues`` are re-keyed: a ``None`` key is dropped,
    and an integer key that is not itself a column name is replaced by the
    column name at that position in ``self.orig_names``.

    Parameters
    ----------
    data : mapping of column name -> np.ndarray

    Returns
    -------
    mapping of column name -> array
    """
    # apply converters
    clean_conv = self._clean_mapping(self.converters)
    clean_dtypes = self._clean_mapping(self.dtype)

    # Apply NA values.
    if isinstance(self.na_values, dict):
        clean_na_values = {}
        clean_na_fvalues = {}
        for key, na_value in self.na_values.items():
            if key is None:
                continue
            na_fvalue = self.na_fvalues[key]

            name = key
            if isinstance(key, int) and key not in self.orig_names:
                name = self.orig_names[key]

            clean_na_values[name] = na_value
            clean_na_fvalues[name] = na_fvalue
    else:
        clean_na_values = self.na_values
        clean_na_fvalues = self.na_fvalues

    return self._convert_to_ndarrays(
        data,
        clean_na_values,
        clean_na_fvalues,
        clean_conv,
        clean_dtypes,
    )
|
|
|
|
@final
def _convert_to_ndarrays(
    self,
    dct: Mapping,
    na_values,
    na_fvalues,
    converters=None,
    dtypes=None,
) -> dict[Any, np.ndarray]:
    """
    Convert every raw column in ``dct`` to its final array.

    Parameters
    ----------
    dct : Mapping
        Column name -> raw values.
    na_values, na_fvalues
        NA markers; resolved per column through ``get_na_values`` when
        ``self.na_filter`` is set, otherwise ignored.
    converters : mapping, optional
        Column name -> callable applied element-wise before inference.
    dtypes : dict or single dtype, optional
        A dict is looked up per column; anything else applies to every
        column.

    Returns
    -------
    dict
        Column name -> converted values.

    Raises
    ------
    ValueError
        If a column requested as a non-extension bool dtype contains NA
        values and still needs casting.
    """
    result = {}
    parse_date_cols = validate_parse_dates_presence(self.parse_dates, self.columns)
    for c, values in dct.items():
        conv_f = None if converters is None else converters.get(c, None)
        if isinstance(dtypes, dict):
            cast_type = dtypes.get(c, None)
        else:
            # single dtype or None
            cast_type = dtypes

        if self.na_filter:
            col_na_values, col_na_fvalues = get_na_values(
                c, na_values, na_fvalues, self.keep_default_na
            )
        else:
            col_na_values, col_na_fvalues = set(), set()

        if c in parse_date_cols:
            # GH#26203 Do not convert columns which get converted to dates
            # but replace nans to ensure to_datetime works
            mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)  # pyright: ignore[reportArgumentType]
            # In-place: NA markers in ``values`` are overwritten with NaN.
            np.putmask(values, mask, np.nan)
            result[c] = values
            continue

        if conv_f is not None:
            # conv_f applied to data before inference
            if cast_type is not None:
                warnings.warn(
                    (
                        "Both a converter and dtype were specified "
                        f"for column {c} - only the converter will be used."
                    ),
                    ParserWarning,
                    stacklevel=find_stack_level(),
                )

            try:
                values = lib.map_infer(values, conv_f)
            except ValueError:
                # Retry with a mask built from ``na_values`` passed to
                # ``map_infer_mask`` alongside the converter.
                mask = algorithms.isin(values, list(na_values)).view(np.uint8)
                values = lib.map_infer_mask(values, conv_f, mask)

            # Numeric/bool inference is switched off for converter output.
            cvals, na_count = self._infer_types(
                values,
                set(col_na_values) | col_na_fvalues,
                cast_type is None,
                try_num_bool=False,
            )
        else:
            is_ea = is_extension_array_dtype(cast_type)
            is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
            # skip inference if specified dtype is object
            # or casting to an EA
            try_num_bool = not (cast_type and is_str_or_ea_dtype)

            # general type inference and conversion
            cvals, na_count = self._infer_types(
                values,
                set(col_na_values) | col_na_fvalues,
                cast_type is None,
                try_num_bool,
            )

            # type specified in dtype param or cast_type is an EA
            if cast_type is not None:
                cast_type = pandas_dtype(cast_type)
            if cast_type and (cvals.dtype != cast_type or is_ea):
                if not is_ea and na_count > 0:
                    if is_bool_dtype(cast_type):
                        raise ValueError(f"Bool column has NA values in column {c}")
                cvals = self._cast_types(cvals, cast_type, c)

        result[c] = cvals
    return result
|
|
|
|
@final
def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
    """
    Cast values to specified type

    Parameters
    ----------
    values : ndarray or ExtensionArray
    cast_type : np.dtype or ExtensionDtype
        dtype to cast values to
    column : string
        column name - used only for error reporting

    Returns
    -------
    converted : ndarray or ExtensionArray

    Raises
    ------
    NotImplementedError
        If the extension array type lacks ``_from_sequence_of_strings``.
    ValueError
        If a plain NumPy cast fails.
    """
    if isinstance(cast_type, CategoricalDtype):
        known_cats = cast_type.categories is not None

        if not (is_object_dtype(values.dtype) or known_cats):
            # TODO: this is for consistency with
            # c-parser which parses all categories
            # as strings
            values = lib.ensure_string_array(
                values, skipna=False, convert_na_value=False
            )

        cats = Index(values, copy=False).unique().dropna()
        return Categorical._from_inferred_categories(
            cats, cats.get_indexer(values), cast_type, true_values=self.true_values
        )

    # use the EA's implementation of casting
    if isinstance(cast_type, ExtensionDtype):
        array_type = cast_type.construct_array_type()
        try:
            if isinstance(cast_type, BooleanDtype):
                # error: Unexpected keyword argument "true_values" for
                # "_from_sequence_of_strings" of "ExtensionArray"
                as_text = [str(val) for val in values]
                return array_type._from_sequence_of_strings(  # type: ignore[call-arg]
                    as_text,
                    dtype=cast_type,
                    true_values=self.true_values,  # pyright: ignore[reportCallIssue]
                    false_values=self.false_values,  # pyright: ignore[reportCallIssue]
                    none_values=self.na_values,  # pyright: ignore[reportCallIssue]
                )
            return array_type._from_sequence_of_strings(values, dtype=cast_type)
        except NotImplementedError as err:
            raise NotImplementedError(
                f"Extension Array: {array_type} must implement "
                "_from_sequence_of_strings in order to be used in parser methods"
            ) from err

    if isinstance(values, ExtensionArray):
        return values.astype(cast_type, copy=False)

    if issubclass(cast_type.type, str):
        # TODO: why skipna=True here and False above? some tests depend
        # on it here, but nothing fails if we change it above
        # (as no tests get there as of 2022-12-06)
        return lib.ensure_string_array(values, skipna=True, convert_na_value=False)

    try:
        return astype_array(values, cast_type, copy=True)
    except ValueError as err:
        raise ValueError(
            f"Unable to convert column {column} to type {cast_type}"
        ) from err
|
|
|
|
@cache_readonly
def _have_mi_columns(self) -> bool:
    """
    Whether ``self.header`` names more than one header row.

    True only for a list, tuple or ndarray with more than one entry; a
    missing or scalar header gives False.
    """
    header = self.header
    return isinstance(header, (list, tuple, np.ndarray)) and len(header) > 1
|
|
|
|
def _infer_columns(
    self,
) -> tuple[list[list[Scalar | None]], int, set[Scalar | None]]:
    """
    Work out the column labels from the header rows and/or ``self.names``.

    Reads rows through ``_buffered_line`` / ``_next_line`` and therefore
    advances the row source and modifies ``self.buf``. ``self.dtype`` may be
    extended in place for renamed duplicate columns, and ``_handle_usecols``
    is called, which sets ``self._col_indices``.

    Returns
    -------
    columns : list of list
        One list of labels per header level.
    num_original_columns : int
        Width of the first header row, or ``len(names)`` where the passed
        names take over.
    unnamed_cols : set
        Labels generated for blank header cells.

    Raises
    ------
    ValueError
        If the file has fewer lines than the requested header rows, or the
        number of passed names does not fit the file.
    EmptyDataError
        If there is nothing to read and no names were passed.
    TypeError
        If names are passed together with multi-level header rows.
    """
    names = self.names
    num_original_columns = 0
    clear_buffer = True
    unnamed_cols: set[Scalar | None] = set()

    if self.header is not None:
        header = self.header
        have_mi_columns = self._have_mi_columns

        if isinstance(header, (list, tuple, np.ndarray)):
            # we have a mi columns, so read an extra line
            if have_mi_columns:
                header = [*list(header), header[-1] + 1]
        else:
            header = [header]

        columns: list[list[Scalar | None]] = []
        for level, hr in enumerate(header):
            try:
                line = self._buffered_line()

                # Advance until the row numbered ``hr`` has been fetched.
                while self.line_pos <= hr:
                    line = self._next_line()

            except StopIteration as err:
                if 0 < self.line_pos <= hr and (
                    not have_mi_columns or hr != header[-1]
                ):
                    # If no rows we want to raise a different message and if
                    # we have mi columns, the last line is not part of the header
                    joi = list(map(str, header[:-1] if have_mi_columns else header))
                    msg = f"[{','.join(joi)}], len of {len(joi)}, "
                    raise ValueError(
                        f"Passed header={msg}but only {self.line_pos} lines in file"
                    ) from err

                # We have an empty file, so check
                # if columns are provided. That will
                # serve as the 'line' for parsing
                if have_mi_columns and hr > 0:
                    if clear_buffer:
                        self.buf.clear()
                    columns.append([None] * len(columns[-1]))
                    return columns, num_original_columns, unnamed_cols

                if not self.names:
                    raise EmptyDataError("No columns to parse from file") from err

                line = self.names[:]

            this_columns: list[Scalar | None] = []
            this_unnamed_cols = []

            # Blank header cells get a generated "Unnamed: ..." label.
            for i, c in enumerate(line):
                if c == "":
                    if have_mi_columns:
                        col_name = f"Unnamed: {i}_level_{level}"
                    else:
                        col_name = f"Unnamed: {i}"

                    this_unnamed_cols.append(i)
                    this_columns.append(col_name)
                else:
                    this_columns.append(c)

            if not have_mi_columns:
                counts: DefaultDict = defaultdict(int)
                # Ensure that regular columns are used before unnamed ones
                # to keep given names and mangle unnamed columns
                col_loop_order = [
                    i
                    for i in range(len(this_columns))
                    if i not in this_unnamed_cols
                ] + this_unnamed_cols

                # TODO: Use pandas.io.common.dedup_names instead (see #50371)
                for i in col_loop_order:
                    col = this_columns[i]
                    old_col = col
                    cur_count = counts[col]

                    if cur_count > 0:
                        # Duplicate label: try "<name>.<n>" with increasing n
                        # until a label is found that is not taken yet.
                        while cur_count > 0:
                            counts[old_col] = cur_count + 1
                            col = f"{old_col}.{cur_count}"
                            if col in this_columns:
                                cur_count += 1
                            else:
                                cur_count = counts[col]

                        # The renamed column inherits the dtype requested for
                        # the original label unless it has one of its own.
                        if (
                            self.dtype is not None
                            and is_dict_like(self.dtype)
                            and self.dtype.get(old_col) is not None
                            and self.dtype.get(col) is None
                        ):
                            self.dtype.update({col: self.dtype.get(old_col)})
                    this_columns[i] = col
                    counts[col] = cur_count + 1
            elif have_mi_columns:
                # if we have grabbed an extra line, but it's not in our
                # format so save in the buffer, and create a blank extra
                # line for the rest of the parsing code
                if hr == header[-1]:
                    lc = len(this_columns)
                    sic = self.index_col
                    ic = len(sic) if sic is not None else 0
                    unnamed_count = len(this_unnamed_cols)

                    # if wrong number of blanks or no index, not our format
                    if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
                        clear_buffer = False
                        this_columns = [None] * lc
                        self.buf = [self.buf[-1]]

            columns.append(this_columns)
            unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})

            if len(columns) == 1:
                num_original_columns = len(this_columns)

        if clear_buffer:
            self.buf.clear()

        first_line: list[Scalar] | None
        if names is not None:
            # Read first row after header to check if data are longer
            try:
                first_line = self._next_line()
            except StopIteration:
                first_line = None

            len_first_data_row = 0 if first_line is None else len(first_line)

            if len(names) > len(columns[0]) and len(names) > len_first_data_row:
                raise ValueError(
                    "Number of passed names did not match "
                    "number of header fields in the file"
                )
            if len(columns) > 1:
                raise TypeError("Cannot pass names with multi-index columns")

            if self.usecols is not None:
                # Set _use_cols. We don't store columns because they are
                # overwritten.
                self._handle_usecols(columns, names, num_original_columns)
            else:
                num_original_columns = len(names)
            if self._col_indices is not None and len(names) != len(
                self._col_indices
            ):
                columns = [[names[i] for i in sorted(self._col_indices)]]
            else:
                columns = [names]
        else:
            columns = self._handle_usecols(
                columns, columns[0], num_original_columns
            )
    else:
        # No header row: the width comes from the first buffered line.
        ncols = len(self._header_line)
        num_original_columns = ncols

        if not names:
            columns = [list(range(ncols))]
            columns = self._handle_usecols(columns, columns[0], ncols)
        elif self.usecols is None or len(names) >= ncols:
            columns = self._handle_usecols([names], names, ncols)
            num_original_columns = len(names)
        elif not callable(self.usecols) and len(names) != len(self.usecols):
            raise ValueError(
                "Number of passed names did not match number of "
                "header fields in the file"
            )
        else:
            # Ignore output but set used columns.
            columns = [names]
            self._handle_usecols(columns, columns[0], ncols)

    return columns, num_original_columns, unnamed_cols
|
|
|
|
@cache_readonly
def _header_line(self):
    """
    First buffered line of a header-less input, kept for reuse.

    Returns None when ``self.header`` is set. If the source is already
    exhausted, a copy of ``self.names`` stands in for the line.

    Raises
    ------
    EmptyDataError
        If the source is exhausted and no names were passed.
    """
    # Store line for reuse in _get_index_name
    if self.header is not None:
        return None

    try:
        return self._buffered_line()
    except StopIteration as err:
        if not self.names:
            raise EmptyDataError("No columns to parse from file") from err
        return self.names[:]
|
|
|
|
def _handle_usecols(
    self,
    columns: list[list[Scalar | None]],
    usecols_key: list[Scalar | None],
    num_original_columns: int,
) -> list[list[Scalar | None]]:
    """
    Sets self._col_indices

    usecols_key is used if there are string usecols.

    Parameters
    ----------
    columns : list of list
        One list of labels per header level.
    usecols_key : list
        Labels against which string (or callable) ``usecols`` are resolved.
    num_original_columns : int
        Upper bound for integer ``usecols``.

    Returns
    -------
    list of list
        ``columns`` restricted to the selected positions, or ``columns``
        itself when ``self.usecols`` is None (``self._col_indices`` is then
        left untouched).

    Raises
    ------
    ValueError
        If string ``usecols`` are combined with several header levels, or a
        requested name is missing (raised by ``_validate_usecols_names``).
    ParserError
        If an integer ``usecols`` entry is out of bounds.
    """
    col_indices: set[int] | list[int]
    if self.usecols is not None:
        if callable(self.usecols):
            col_indices = evaluate_callable_usecols(self.usecols, usecols_key)
        elif any(isinstance(u, str) for u in self.usecols):
            if len(columns) > 1:
                raise ValueError(
                    "If using multiple headers, usecols must be integers."
                )
            col_indices = []

            for col in self.usecols:
                if isinstance(col, str):
                    try:
                        col_indices.append(usecols_key.index(col))
                    except ValueError:
                        self._validate_usecols_names(self.usecols, usecols_key)
                else:
                    col_indices.append(col)
        else:
            missing_usecols = [
                col for col in self.usecols if col >= num_original_columns
            ]
            if missing_usecols:
                raise ParserError(
                    "Defining usecols with out-of-bounds indices is not allowed. "
                    f"{missing_usecols} are out-of-bounds.",
                )
            col_indices = self.usecols

        # Membership is tested once per column per header level; a hashed
        # lookup avoids rescanning a list of indices for every column.
        selected = frozenset(col_indices)
        columns = [
            [n for i, n in enumerate(column) if i in selected]
            for column in columns
        ]
        # Built from the original collection so repeated indices are kept.
        self._col_indices = sorted(col_indices)
    return columns
|
|
|
|
def _buffered_line(self) -> list[Scalar]:
    """
    Return the first buffered line, fetching one if the buffer is empty.

    The line stays in ``self.buf``; nothing is removed here.
    """
    if not self.buf:
        return self._next_line()
    return self.buf[0]
|
|
|
|
def _check_for_bom(self, first_row: list[Scalar]) -> list[Scalar]:
    """
    Strip a leading BOM character from the first field of ``first_row``.

    If the character right after the BOM is ``self.quotechar``, that quote
    and the next occurrence of it are removed as well, because the quoting
    really starts the field rather than sitting in the middle of it.

    Parameters
    ----------
    first_row : list
        First row of the input.

    Returns
    -------
    list
        ``first_row`` itself when there is nothing to strip (empty row,
        non-string or empty first field, no BOM); otherwise a new list with
        the cleaned first field.

    Raises
    ------
    ValueError
        If an opening quote follows the BOM but the field contains no
        closing quote.
    """
    if not first_row:
        return first_row

    head = first_row[0]
    # Only a non-empty string can start with the BOM.
    if not isinstance(head, str) or not head or head[0] != _BOM:
        return first_row

    cleaned: str
    if len(head) > 1 and head[1] == self.quotechar:
        closing = head.index(head[1], 2)
        # Text between the quotes, followed by whatever trails the closing one.
        cleaned = head[2:closing] + head[closing + 1 :]
    else:
        # No quotation so just remove BOM from first element
        cleaned = head[1:]

    return [cleaned] + first_row[1:]
|
|
|
|
def _is_line_empty(self, line: Sequence[Scalar]) -> bool:
    """
    Check if a line is empty or not.

    Parameters
    ----------
    line : str, array-like
        The line of data to check.

    Returns
    -------
    boolean : True when the line is falsy or every element in it is falsy.
    """
    if not line:
        return True
    return not any(line)
|
|
|
|
def _next_line(self) -> list[Scalar]:
    """
    Fetch the next usable row, append it to ``self.buf`` and return it.

    Rows for which ``self.skipfunc`` is true are passed over, comments are
    stripped with ``_check_comments`` and, depending on
    ``self.skip_blank_lines``, blank rows are dropped. ``self.pos`` is
    advanced for every raw row taken from the source and ``self.line_pos``
    once for the row that is returned.

    Raises
    ------
    StopIteration
        When the source has no more rows.
    """
    if isinstance(self.data, list):
        # In-memory rows: ``self.pos`` is an index into the list.
        while self.skipfunc(self.pos):
            # Guard against a skip predicate that never turns false.
            if self.pos >= len(self.data):
                break
            self.pos += 1

        while True:
            try:
                line = self._check_comments([self.data[self.pos]])[0]
                self.pos += 1
                # either uncommented or blank to begin with
                if not self.skip_blank_lines and (
                    self._is_line_empty(self.data[self.pos - 1]) or line
                ):
                    break
                if self.skip_blank_lines:
                    ret = self._remove_empty_lines([line])
                    if ret:
                        line = ret[0]
                        break
            except IndexError as err:
                # Running off the end of the list means the data is exhausted.
                raise StopIteration from err
    else:
        # Iterator source: skipped rows still have to be consumed.
        while self.skipfunc(self.pos):
            self.pos += 1
            next(self.data)

        while True:
            orig_line = self._next_iter_line(row_num=self.pos + 1)
            self.pos += 1

            # ``None`` marks a malformed row that was not raised as an error;
            # move on to the next one.
            if orig_line is not None:
                line = self._check_comments([orig_line])[0]

                if self.skip_blank_lines:
                    ret = self._remove_empty_lines([line])

                    if ret:
                        line = ret[0]
                        break
                elif self._is_line_empty(orig_line) or line:
                    break

    # This was the first line of the file,
    # which could contain the BOM at the
    # beginning of it.
    if self.pos == 1:
        line = self._check_for_bom(line)

    self.line_pos += 1
    self.buf.append(line)
    return line
|
|
|
|
def _alert_malformed(self, msg: str, row_num: int) -> None:
    """
    Alert a user about a malformed row, depending on value of
    `self.on_bad_lines` enum.

    If `self.on_bad_lines` is ERROR, a `ParserError` is raised.
    If `self.on_bad_lines` is WARN or a callable, a `ParserWarning` is
    issued. Any other value leaves the row unreported.

    Parameters
    ----------
    msg: str
        The error message to display.
    row_num: int
        The row number where the parsing error occurred.
        Because this row number is displayed, we 1-index,
        even though we 0-index internally.

    Raises
    ------
    ParserError
        When `self.on_bad_lines` is ERROR.
    """
    policy = self.on_bad_lines
    methods = self.BadLineHandleMethod

    if policy == methods.ERROR:
        raise ParserError(msg)

    if policy == methods.WARN or callable(policy):
        warnings.warn(
            f"Skipping line {row_num}: {msg}\n",
            ParserWarning,
            stacklevel=find_stack_level(),
        )
|
|
|
|
def _next_iter_line(self, row_num: int) -> list[Scalar] | None:
    """
    Wrapper around iterating through `self.data` (CSV source).

    When a CSV error is raised, we check for specific
    error messages that allow us to customize the
    error message displayed to the user.

    Parameters
    ----------
    row_num: int
        The row number of the line being parsed.

    Returns
    -------
    list or None
        The next row, or None when reading it raised a ``csv.Error`` that
        ``_alert_malformed`` did not turn into an exception.
    """
    try:
        assert not isinstance(self.data, list)
        # lie about list[str] vs list[Scalar] to minimize ignores
        return next(self.data)  # type: ignore[return-value]
    except csv.Error as e:
        reportable = (
            self.BadLineHandleMethod.ERROR,
            self.BadLineHandleMethod.WARN,
        )
        if self.on_bad_lines not in reportable:
            return None

        msg = str(e)

        if "NULL byte" in msg or "line contains NUL" in msg:
            msg = (
                "NULL byte detected. This byte "
                "cannot be processed in Python's "
                "native csv library at the moment, "
                "so please pass in engine='c' instead"
            )

        if self.skipfooter > 0:
            reason = (
                "Error could possibly be due to "
                "parsing errors in the skipped footer rows "
                "(the skipfooter keyword is only applied "
                "after Python's csv library has parsed "
                "all rows)."
            )
            msg += ". " + reason

        self._alert_malformed(msg, row_num)
        return None
|
|
|
|
def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
    """
    Cut every line at its first comment marker.

    A field is kept untouched when it is not a string, does not contain
    ``self.comment`` or is one of ``self.na_values``. The first other field
    is truncated at the marker (and kept only if something is left), and
    the rest of the line is discarded.

    Parameters
    ----------
    lines : list of list of Scalars

    Returns
    -------
    list of list of Scalars
        ``lines`` itself when no comment character is configured, otherwise
        newly built lists.
    """
    marker = self.comment
    if marker is None:
        return lines

    stripped_lines = []
    for line in lines:
        kept = []
        for field in line:
            is_plain = (
                not isinstance(field, str)
                or marker not in field
                or field in self.na_values
            )
            if is_plain:
                kept.append(field)
                continue

            before_marker = field[: field.find(marker)]
            if len(before_marker) > 0:
                kept.append(before_marker)
            break
        stripped_lines.append(kept)
    return stripped_lines
|
|
|
|
def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]:
    """
    Iterate through the lines and remove any that are
    either empty or contain only one whitespace value

    Parameters
    ----------
    lines : list of list of Scalars
        The array of lines that we are to filter.

    Returns
    -------
    filtered_lines : list of list of Scalars
        The same array of lines with the "empty" ones removed.
    """
    kept = []
    for line in lines:
        width = len(line)
        if width > 1:
            # Several fields always count as content, even if all are blank.
            kept.append(line)
        elif width == 1:
            only = line[0]
            if not isinstance(only, str) or only.strip():
                kept.append(line)
    return kept
|
|
|
|
def _check_thousands(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
    """
    Remove ``self.thousands`` from numeric-looking fields.

    Returns ``lines`` unchanged when no thousands separator is configured;
    otherwise delegates to ``_search_replace_num_columns``.
    """
    if self.thousands is not None:
        return self._search_replace_num_columns(
            lines=lines, search=self.thousands, replace=""
        )
    return lines
|
|
|
|
def _search_replace_num_columns(
    self, lines: list[list[Scalar]], search: str, replace: str
) -> list[list[Scalar]]:
    """
    Replace ``search`` by ``replace`` in fields that look numeric.

    A field is rewritten only if it is a string containing ``search``, its
    position is not in ``self._no_thousands_columns`` and its stripped text
    matches ``self.num``. Everything else is passed through untouched.

    Parameters
    ----------
    lines : list of list of Scalars
    search : str
    replace : str

    Returns
    -------
    list of list of Scalars
        Newly built rows.
    """

    def _rewrite(position, field):
        if not isinstance(field, str):
            return field
        if search not in field:
            return field
        if position in self._no_thousands_columns:
            return field
        if not self.num.search(field.strip()):
            return field
        return field.replace(search, replace)

    return [
        [_rewrite(position, field) for position, field in enumerate(line)]
        for line in lines
    ]
|
|
|
|
def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
    """
    Turn a custom decimal marker into ``"."`` in numeric-looking fields.

    Returns ``lines`` unchanged when ``self.decimal`` equals the parser
    default; otherwise delegates to ``_search_replace_num_columns``.
    """
    if self.decimal != parser_defaults["decimal"]:
        return self._search_replace_num_columns(
            lines=lines, search=self.decimal, replace="."
        )
    return lines
|
|
|
|
def _get_index_name(
    self,
) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
    """
    Try several cases to get lines:

    0) There are headers on row 0 and row 1 and their
       total summed lengths equals the length of the next line.
       Treat row 0 as columns and row 1 as indices
    1) Look for implicit index: there are more columns
       on row 1 than row 0. If this is true, assume that row
       1 lists index columns and row 0 lists normal columns.
    2) Get index from the columns if it was listed.

    Returns
    -------
    index_name : sequence of Hashable or None
        In case 0 this is the line that held the index names; in case 1
        it is None; in case 2 it is whatever ``_clean_index_names``
        returns as its first element.
    orig_names : list of Hashable
        Copy of ``self.orig_names``, with the index names prepended in
        case 0.
    columns : list of Hashable
        Copy of ``self.orig_names``, with the index names prepended in
        case 0.

    Notes
    -----
    Depending on the case taken, this method updates ``self.index_col``,
    ``self.buf``, ``self.num_original_columns`` and
    ``self._implicit_index``.
    """
    columns: Sequence[Hashable] = self.orig_names
    # Two independent copies, so that inserting into ``columns`` below does
    # not touch ``self.orig_names`` or ``orig_names``.
    orig_names = list(columns)
    columns = list(columns)

    # First candidate line: the stored header line if there is one,
    # otherwise the next line from the input (None when exhausted).
    line: list[Scalar] | None
    if self._header_line is not None:
        line = self._header_line
    else:
        try:
            line = self._next_line()
        except StopIteration:
            line = None

    # Second candidate line (None when the input is exhausted).
    next_line: list[Scalar] | None
    try:
        next_line = self._next_line()
    except StopIteration:
        next_line = None

    # implicitly index_col=0 b/c 1 fewer column names
    implicit_first_cols = 0
    if line is not None:
        # leave it 0, #2442
        # Case 1
        index_col = self.index_col
        if index_col is not False:
            # Number of fields in ``line`` beyond the known column count.
            implicit_first_cols = len(line) - self.num_original_columns

        # Case 0
        if (
            next_line is not None
            and self.header is not None
            and index_col is not False
        ):
            if len(next_line) == len(line) + self.num_original_columns:
                # column and index names on diff rows
                self.index_col = list(range(len(line)))
                # Drop the first buffered line.
                self.buf = self.buf[1:]

                # Prepend the index names, preserving their order.
                for c in reversed(line):
                    columns.insert(0, c)

                # Update list of original names to include all indices.
                orig_names = list(columns)
                self.num_original_columns = len(columns)
                return line, orig_names, columns

    if implicit_first_cols > 0:
        # Case 1
        self._implicit_index = True
        # An explicitly supplied index_col is left as it is.
        if self.index_col is None:
            self.index_col = list(range(implicit_first_cols))

        index_name = None

    else:
        # Case 2
        (index_name, _, self.index_col) = self._clean_index_names(
            columns, self.index_col
        )

    return index_name, orig_names, columns
|
|
|
|
def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
    """
    Turn row-oriented ``content`` into a list of per-column arrays.

    Rows with more fields than expected are handled according to
    ``self.on_bad_lines`` before the rows are converted with
    ``lib.to_object_array`` and transposed. When ``self.usecols`` is set,
    only the columns listed in ``self._col_indices`` (plus any implicit
    index columns) are kept.

    Parameters
    ----------
    content : list of list of Scalars
        The parsed rows. Must be non-empty, since ``max`` is taken over
        the row lengths.

    Returns
    -------
    list of np.ndarray
        One array per retained column.
    """
    # Expected number of fields per row.
    col_len = self.num_original_columns

    if self._implicit_index:
        col_len += len(self.index_col)

    max_len = max(len(row) for row in content)

    # Check that there are no rows with too many
    # elements in their row (rows with too few
    # elements are padded with NaN).
    if max_len > col_len and self.index_col is not False and self.usecols is None:
        footers = self.skipfooter if self.skipfooter else 0
        # Entries are (row_num, observed_length, source) where source is
        # "callable" or "normal".
        bad_lines = []

        # Capture the enumerate and the length of the original list before
        # ``content`` is rebound to the filtered list built below.
        iter_content = enumerate(content)
        content_len = len(content)
        content = []

        for i, _content in iter_content:
            actual_len = len(_content)
            if actual_len > col_len:
                if callable(self.on_bad_lines):
                    new_l = self.on_bad_lines(_content)
                    # A None result drops the row.
                    if new_l is not None:
                        new_l = cast(list[Scalar], new_l)
                        if len(new_l) > col_len:
                            # The callable's result is still too long:
                            # record it and truncate to the expected width.
                            row_num = self.pos - (content_len - i + footers)
                            bad_lines.append((row_num, len(new_l), "callable"))
                            new_l = new_l[:col_len]
                        content.append(new_l)

                elif self.on_bad_lines in (
                    self.BadLineHandleMethod.ERROR,
                    self.BadLineHandleMethod.WARN,
                ):
                    row_num = self.pos - (content_len - i + footers)
                    bad_lines.append((row_num, actual_len, "normal"))
                    if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
                        # Stop at the first offending row.
                        break
                # Any other on_bad_lines value: the over-long row is
                # dropped without being recorded.
            else:
                content.append(_content)

        # Report every recorded bad line through ``_alert_malformed``
        # (presumably raises or warns depending on on_bad_lines - confirm
        # against its definition).
        for row_num, actual_len, source in bad_lines:
            msg = (
                f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}"
            )
            if source == "callable":
                msg += " from bad_lines callable"
            elif (
                self.delimiter
                and len(self.delimiter) > 1
                and self.quoting != csv.QUOTE_NONE
            ):
                # see gh-13374
                reason = (
                    "Error could possibly be due to quotes being "
                    "ignored when a multi-char delimiter is used."
                )
                msg += ". " + reason

            self._alert_malformed(msg, row_num + 1)

    # see gh-13320
    zipped_content = list(lib.to_object_array(content, min_width=col_len).T)

    if self.usecols:
        assert self._col_indices is not None
        col_indices = self._col_indices

        if self._implicit_index:
            # Keep the leading index columns unconditionally; the positions
            # in ``col_indices`` are offset by the number of index columns.
            zipped_content = [
                a
                for i, a in enumerate(zipped_content)
                if (
                    i < len(self.index_col)
                    or i - len(self.index_col) in col_indices
                )
            ]
        else:
            zipped_content = [
                a for i, a in enumerate(zipped_content) if i in col_indices
            ]
    return zipped_content
|
|
|
|
def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
    """
    Fetch rows from the buffer and the underlying data, then post-process.

    Parameters
    ----------
    rows : int, optional
        Number of rows requested. When None, everything remaining in
        ``self.data`` is read.

    Returns
    -------
    list of list of Scalars
        The fetched rows after footer trimming, comment removal, optional
        blank-line removal, and thousands/decimal normalisation.

    Raises
    ------
    StopIteration
        When ``self.data`` is a list and ``self.pos`` is already past its
        end, or when ``self.data`` is an iterator that is exhausted and no
        lines at all were collected.
    """
    # NOTE: ``lines`` aliases ``self.buf``; the ``extend`` calls below
    # therefore append to the buffer list in place.
    lines = self.buf
    new_rows = None

    # already fetched some number
    if rows is not None:
        # we already have the lines in the buffer
        if len(self.buf) >= rows:
            new_rows, self.buf = self.buf[:rows], self.buf[rows:]

        # need some lines
        else:
            rows -= len(self.buf)

    if new_rows is None:
        if isinstance(self.data, list):
            # In-memory data: slice directly.
            if self.pos > len(self.data):
                raise StopIteration
            if rows is None:
                new_rows = self.data[self.pos :]
                new_pos = len(self.data)
            else:
                new_rows = self.data[self.pos : self.pos + rows]
                new_pos = self.pos + rows

            new_rows = self._remove_skipped_rows(new_rows)
            lines.extend(new_rows)
            self.pos = new_pos

        else:
            # Iterator data: pull rows one at a time.
            new_rows = []
            try:
                if rows is not None:
                    row_index = 0
                    row_ct = 0
                    offset = self.pos if self.pos is not None else 0
                    # Keep reading until ``rows`` non-skipped rows were
                    # seen; skipped rows are still collected here and
                    # filtered out by ``_remove_skipped_rows`` below.
                    while row_ct < rows:
                        new_row = next(self.data)
                        if not self.skipfunc(offset + row_index):
                            row_ct += 1
                        row_index += 1
                        new_rows.append(new_row)

                    # Count before filtering so that ``self.pos`` advances
                    # by the number of rows actually consumed.
                    len_new_rows = len(new_rows)
                    new_rows = self._remove_skipped_rows(new_rows)
                    lines.extend(new_rows)
                else:
                    rows = 0

                    # This loop has no ``break``: it ends only when
                    # StopIteration propagates to the handler below.
                    while True:
                        next_row = self._next_iter_line(row_num=self.pos + rows + 1)
                        rows += 1

                        if next_row is not None:
                            new_rows.append(next_row)
                    # Not reached, because the loop above never exits
                    # normally; the except branch sets this value.
                    len_new_rows = len(new_rows)

            except StopIteration:
                len_new_rows = len(new_rows)
                new_rows = self._remove_skipped_rows(new_rows)
                lines.extend(new_rows)
                # Re-raise only if nothing at all was collected.
                if len(lines) == 0:
                    raise
            self.pos += len_new_rows

        # Everything buffered has been moved into ``lines``.
        self.buf = []
    else:
        lines = new_rows

    if self.skipfooter:
        lines = lines[: -self.skipfooter]

    lines = self._check_comments(lines)
    if self.skip_blank_lines:
        lines = self._remove_empty_lines(lines)
    lines = self._check_thousands(lines)
    return self._check_decimal(lines)
|
|
|
|
def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar]]:
|
|
if self.skiprows:
|
|
return [
|
|
row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos)
|
|
]
|
|
return new_rows
|
|
|
|
def _set_no_thousand_columns(self) -> set[int]:
|
|
no_thousands_columns: set[int] = set()
|
|
if self.columns and self.parse_dates:
|
|
assert self._col_indices is not None
|
|
no_thousands_columns = self._set_noconvert_dtype_columns(
|
|
self._col_indices, self.columns
|
|
)
|
|
if self.columns and self.dtype:
|
|
assert self._col_indices is not None
|
|
for i, col in zip(self._col_indices, self.columns, strict=True):
|
|
if not isinstance(self.dtype, dict) and not is_numeric_dtype(
|
|
self.dtype
|
|
):
|
|
no_thousands_columns.add(i)
|
|
if (
|
|
isinstance(self.dtype, dict)
|
|
and col in self.dtype
|
|
and (
|
|
not is_numeric_dtype(self.dtype[col])
|
|
or is_bool_dtype(self.dtype[col])
|
|
)
|
|
):
|
|
no_thousands_columns.add(i)
|
|
return no_thousands_columns
|
|
|
|
|
|
class FixedWidthReader(abc.Iterator):
|
|
"""
|
|
A reader of fixed-width lines.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
f: IO[str] | ReadCsvBuffer[str],
|
|
colspecs: list[tuple[int, int]] | Literal["infer"],
|
|
delimiter: str | None,
|
|
comment: str | None,
|
|
skiprows: set[int] | None = None,
|
|
infer_nrows: int = 100,
|
|
) -> None:
|
|
self.f = f
|
|
self.buffer: Iterator | None = None
|
|
self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
|
|
self.comment = comment
|
|
if colspecs == "infer":
|
|
self.colspecs = self.detect_colspecs(
|
|
infer_nrows=infer_nrows, skiprows=skiprows
|
|
)
|
|
else:
|
|
self.colspecs = colspecs
|
|
|
|
if not isinstance(self.colspecs, (tuple, list)):
|
|
raise TypeError(
|
|
"column specifications must be a list or tuple, "
|
|
f"input was a {type(colspecs).__name__}"
|
|
)
|
|
|
|
for colspec in self.colspecs:
|
|
if not (
|
|
isinstance(colspec, (tuple, list))
|
|
and len(colspec) == 2
|
|
and isinstance(colspec[0], (int, np.integer, type(None)))
|
|
and isinstance(colspec[1], (int, np.integer, type(None)))
|
|
):
|
|
raise TypeError(
|
|
"Each column specification must be "
|
|
"2 element tuple or list of integers"
|
|
)
|
|
|
|
def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
|
|
"""
|
|
Read rows from self.f, skipping as specified.
|
|
|
|
We distinguish buffer_rows (the first <= infer_nrows
|
|
lines) from the rows returned to detect_colspecs
|
|
because it's simpler to leave the other locations
|
|
with skiprows logic alone than to modify them to
|
|
deal with the fact we skipped some rows here as
|
|
well.
|
|
|
|
Parameters
|
|
----------
|
|
infer_nrows : int
|
|
Number of rows to read from self.f, not counting
|
|
rows that are skipped.
|
|
skiprows: set, optional
|
|
Indices of rows to skip.
|
|
|
|
Returns
|
|
-------
|
|
detect_rows : list of str
|
|
A list containing the rows to read.
|
|
|
|
"""
|
|
if skiprows is None:
|
|
skiprows = set()
|
|
buffer_rows = []
|
|
detect_rows = []
|
|
for i, row in enumerate(self.f):
|
|
if i not in skiprows:
|
|
detect_rows.append(row)
|
|
buffer_rows.append(row)
|
|
if len(detect_rows) >= infer_nrows:
|
|
break
|
|
self.buffer = iter(buffer_rows)
|
|
return detect_rows
|
|
|
|
def detect_colspecs(
|
|
self, infer_nrows: int = 100, skiprows: set[int] | None = None
|
|
) -> list[tuple[int, int]]:
|
|
# Regex escape the delimiters
|
|
delimiters = "".join([rf"\{x}" for x in self.delimiter])
|
|
pattern = re.compile(f"([^{delimiters}]+)")
|
|
rows = self.get_rows(infer_nrows, skiprows)
|
|
if not rows:
|
|
raise EmptyDataError("No rows from which to infer column width")
|
|
max_len = max(map(len, rows))
|
|
mask = np.zeros(max_len + 1, dtype=int)
|
|
if self.comment is not None:
|
|
rows = [row.partition(self.comment)[0] for row in rows]
|
|
for row in rows:
|
|
for m in pattern.finditer(row):
|
|
mask[m.start() : m.end()] = 1
|
|
shifted = np.roll(mask, 1)
|
|
shifted[0] = 0
|
|
edges = np.where((mask ^ shifted) == 1)[0]
|
|
edge_pairs = list(zip(edges[::2], edges[1::2], strict=True))
|
|
return edge_pairs
|
|
|
|
def __next__(self) -> list[str]:
|
|
if self.buffer is not None:
|
|
try:
|
|
line = next(self.buffer)
|
|
except StopIteration:
|
|
self.buffer = None
|
|
line = next(self.f) # type: ignore[arg-type]
|
|
else:
|
|
line = next(self.f) # type: ignore[arg-type]
|
|
# Note: 'colspecs' is a sequence of half-open intervals.
|
|
return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
|
|
|
|
|
|
class FixedWidthFieldParser(PythonParser):
    """
    Specialization that Converts fixed-width fields into DataFrames.
    See PythonParser for details.
    """

    def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
        # Take the fixed-width-only options out of ``kwds`` before handing
        # the remainder to the base parser; both keys are required.
        self.colspecs = kwds.pop("colspecs")
        self.infer_nrows = kwds.pop("infer_nrows")
        PythonParser.__init__(self, f, **kwds)

    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader:
        """Build the fixed-width line reader for ``f`` from this parser's options."""
        return FixedWidthReader(
            f,
            colspecs=self.colspecs,
            delimiter=self.delimiter,
            comment=self.comment,
            skiprows=self.skiprows,
            infer_nrows=self.infer_nrows,
        )

    def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]:
        """
        Returns the list of lines without the empty ones. With fixed-width
        fields, empty lines become arrays of empty strings.

        A line is kept as soon as one of its fields is either a non-string
        value or a string with non-whitespace content.

        See PythonParser._remove_empty_lines.
        """
        kept = []
        for row in lines:
            for field in row:
                if not isinstance(field, str) or field.strip():
                    kept.append(row)
                    break
        return kept
|
|
|
|
|
|
def _validate_skipfooter_arg(skipfooter: int) -> int:
|
|
"""
|
|
Validate the 'skipfooter' parameter.
|
|
|
|
Checks whether 'skipfooter' is a non-negative integer.
|
|
Raises a ValueError if that is not the case.
|
|
|
|
Parameters
|
|
----------
|
|
skipfooter : non-negative integer
|
|
The number of rows to skip at the end of the file.
|
|
|
|
Returns
|
|
-------
|
|
validated_skipfooter : non-negative integer
|
|
The original input if the validation succeeds.
|
|
|
|
Raises
|
|
------
|
|
ValueError : 'skipfooter' was not a non-negative integer.
|
|
"""
|
|
if not is_integer(skipfooter):
|
|
raise ValueError("skipfooter must be an integer")
|
|
|
|
if skipfooter < 0:
|
|
raise ValueError("skipfooter cannot be negative")
|
|
|
|
# Incompatible return value type (got "Union[int, integer[Any]]", expected "int")
|
|
return skipfooter # type: ignore[return-value]
|