Source code for datalad_next.itertools.itemize

"""Get complete items from input chunks"""

from __future__ import annotations

from typing import (
    Generator,
    Iterable,
    TypeVar,
)


__all__ = ['itemize']


T = TypeVar('T', str, bytes, bytearray)


[docs] def itemize( iterable: Iterable[T], sep: T | None, *, keep_ends: bool = False, ) -> Generator[T, None, None]: """Yields complete items (only), assembled from an iterable This function consumes chunks from an iterable and yields items defined by a separator. An item might span multiple input chunks. Input (chunks) can be ``bytes``, ``bytearray``, or ``str`` objects. The result type is determined by the type of the first input chunk. During its runtime, the type of the elements in ``iterable`` must not change. Items are defined by a separator given via ``sep``. If ``sep`` is ``None``, the line-separators built into ``str.splitlines()`` are used, and each yielded item will be a line. If ``sep`` is not `None`, its type must be compatible to the type of the elements in ``iterable``. A separator could, for example, be ``b'\\n'``, in which case the items would be terminated by Unix line-endings, i.e. each yielded item is a single line. The separator could also be, ``b'\\x00'`` (or ``'\\x00'``), to split zero-byte delimited content, like the output of ``git ls-files -z``. Separators can be longer than one byte or character, e.g. ``b'\\r\\n'``, or ``b'\\n-------------------\\n'``. Content after the last separator, possibly merged across input chunks, is always yielded as the last item, even if it is not terminated by the separator. Performance notes: - Using ``None`` as a separator (splitlines-mode) is slower than providing a specific separator. - If another separator than ``None`` is used, the runtime with ``keep_end=False`` is faster than with ``keep_end=True``. Parameters ---------- iterable: Iterable[str | bytes | bytearray] The iterable that yields the input data sep: str | bytes | bytearray | None The separator that defines items. If ``None``, the items are determined by the line-separators that are built into ``str.splitlines()``. keep_ends: bool If `True`, the item-separator will remain at the end of a yielded item. If `False`, items will not contain the separator. Preserving separators implies a runtime cost, unless the separator is ``None``. Yields ------ str | bytes | bytearray The items determined from the input iterable. The type of the yielded items depends on the type of the first element in ``iterable``. Examples -------- .. code-block:: python >>> from datalad_next.itertools import itemize >>> with open('/etc/passwd', 'rt') as f: # doctest: +SKIP ... print(tuple(itemize(iter(f.read, ''), sep=None))[0:2]) # doctest: +SKIP ('root:x:0:0:root:/root:/bin/bash', 'systemd-timesync:x:497:497:systemd Time Synchronization:/:/usr/sbin/nologin') >>> with open('/etc/passwd', 'rt') as f: # doctest: +SKIP ... print(tuple(itemize(iter(f.read, ''), sep=':'))[0:10]) # doctest: +SKIP ('root', 'x', '0', '0', 'root', '/root', '/bin/bash\\nsystemd-timesync', 'x', '497', '497') >>> with open('/etc/passwd', 'rt') as f: # doctest: +SKIP ... print(tuple(itemize(iter(f.read, ''), sep=':', keep_ends=True))[0:10]) # doctest: +SKIP ('root:', 'x:', '0:', '0:', 'root:', '/root:', '/bin/bash\\nsystemd-timesync:', 'x:', '497:', '497:') """ if sep is None: yield from _split_lines(iterable, keep_ends=keep_ends) else: yield from _split_items_with_separator( iterable, sep=sep, keep_ends=keep_ends, )
def _split_items_with_separator(iterable: Iterable[T], sep: T, keep_ends: bool = False, ) -> Generator[T, None, None]: assembled = None for chunk in iterable: if not assembled: assembled = chunk else: assembled += chunk items = assembled.split(sep=sep) if len(items) == 1: continue if assembled.endswith(sep): assembled = None else: assembled = items[-1] items.pop(-1) if keep_ends: for item in items: yield item + sep else: yield from items if assembled: yield assembled def _split_lines(iterable: Iterable[T], keep_ends: bool = False, ) -> Generator[T, None, None]: assembled = None for chunk in iterable: if not assembled: assembled = chunk else: assembled += chunk # We don't know all elements on which python splits lines, therefore we # split once with ends and once without ends. Lines that differ have no # ending lines_with_end = assembled.splitlines(keepends=True) lines_without_end = assembled.splitlines(keepends=False) if lines_with_end[-1] == lines_without_end[-1]: assembled = lines_with_end[-1] lines_with_end.pop(-1) lines_without_end.pop(-1) else: assembled = None if keep_ends: yield from lines_with_end else: yield from lines_without_end if assembled: yield assembled