PyArrow Functionality

pandas can utilize PyArrow to extend functionality and improve the performance of various APIs. This includes:

  • More extensive data types compared to NumPy

  • Missing data support (NA) for all data types

  • Performant IO reader integration

  • Interoperability with other dataframe libraries based on the Apache Arrow specification (e.g. polars, cuDF)

To use this functionality, please ensure you have installed the minimum supported PyArrow version.

Data Structure Integration

A Series, Index, or the columns of a DataFrame can be directly backed by a pyarrow.ChunkedArray, which is similar to a NumPy array. To construct these from the main pandas data structures, you can pass in a string of the type followed by [pyarrow], e.g. "int64[pyarrow]", into the dtype parameter.

In [1]: ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:431, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    428     index = ensure_index(index)
    430 if dtype is not None:
--> 431     dtype = self._validate_dtype(dtype)
    433 if data is None:
    434     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:492, in NDFrame._validate_dtype(cls, dtype)
    490 """validate the passed dtype"""
    491 if dtype is not None:
--> 492     dtype = pandas_dtype(dtype)
    494     # a compound dtype
    495     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1615, in pandas_dtype(dtype)
   1612     return dtype
   1614 # registered extension types
-> 1615 result = registry.find(dtype)
   1616 if result is not None:
   1617     if isinstance(result, type):
   1618         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:537, in Registry.find(self, dtype)
    535 for dtype_type in self.dtypes:
    536     try:
--> 537         return dtype_type.construct_from_string(dtype)
    538     except TypeError:
    539         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2203, in ArrowDtype.construct_from_string(cls, string)
   2201 base_type = string[:-9]  # get rid of "[pyarrow]"
   2202 try:
-> 2203     pa_dtype = pa.type_for_alias(base_type)
   2204 except ValueError as err:
   2205     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [2]: ser
Out[2]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [3]: idx = pd.Index([True, None], dtype="bool[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 idx = pd.Index([True, None], dtype="bool[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/indexes/base.py:490, in Index.__new__(cls, data, dtype, copy, name, tupleize_cols)
    487 name = maybe_extract_name(name, data, cls)
    489 if dtype is not None:
--> 490     dtype = pandas_dtype(dtype)
    492 data_dtype = getattr(data, "dtype", None)
    494 refs = None

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1615, in pandas_dtype(dtype)
   1612     return dtype
   1614 # registered extension types
-> 1615 result = registry.find(dtype)
   1616 if result is not None:
   1617     if isinstance(result, type):
   1618         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:537, in Registry.find(self, dtype)
    535 for dtype_type in self.dtypes:
    536     try:
--> 537         return dtype_type.construct_from_string(dtype)
    538     except TypeError:
    539         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2203, in ArrowDtype.construct_from_string(cls, string)
   2201 base_type = string[:-9]  # get rid of "[pyarrow]"
   2202 try:
-> 2203     pa_dtype = pa.type_for_alias(base_type)
   2204 except ValueError as err:
   2205     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [4]: idx
Out[4]: [0.0, 1.0, 10.0]

In [5]: df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/frame.py:674, in DataFrame.__init__(self, data, index, columns, dtype, copy)
    665 def __init__(
    666     self,
    667     data=None,
   (...)
    671     copy: bool | None = None,
    672 ) -> None:
    673     if dtype is not None:
--> 674         dtype = self._validate_dtype(dtype)
    676     if isinstance(data, DataFrame):
    677         data = data._mgr

File /usr/lib/python3/dist-packages/pandas/core/generic.py:492, in NDFrame._validate_dtype(cls, dtype)
    490 """validate the passed dtype"""
    491 if dtype is not None:
--> 492     dtype = pandas_dtype(dtype)
    494     # a compound dtype
    495     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1615, in pandas_dtype(dtype)
   1612     return dtype
   1614 # registered extension types
-> 1615 result = registry.find(dtype)
   1616 if result is not None:
   1617     if isinstance(result, type):
   1618         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:537, in Registry.find(self, dtype)
    535 for dtype_type in self.dtypes:
    536     try:
--> 537         return dtype_type.construct_from_string(dtype)
    538     except TypeError:
    539         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2203, in ArrowDtype.construct_from_string(cls, string)
   2201 base_type = string[:-9]  # get rid of "[pyarrow]"
   2202 try:
-> 2203     pa_dtype = pa.type_for_alias(base_type)
   2204 except ValueError as err:
   2205     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [6]: df
Out[6]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

Note

The string alias "string[pyarrow]" maps to pd.StringDtype("pyarrow") which is not equivalent to specifying dtype=pd.ArrowDtype(pa.string()). Generally, operations on the data will behave similarly except pd.StringDtype("pyarrow") can return NumPy-backed nullable types while pd.ArrowDtype(pa.string()) will return ArrowDtype.

In [7]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[7], line 1
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [8]: data = list("abc")

In [9]: ser_sd = pd.Series(data, dtype="string[pyarrow]")
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[9], line 1
----> 1 ser_sd = pd.Series(data, dtype="string[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:431, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    428     index = ensure_index(index)
    430 if dtype is not None:
--> 431     dtype = self._validate_dtype(dtype)
    433 if data is None:
    434     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:492, in NDFrame._validate_dtype(cls, dtype)
    490 """validate the passed dtype"""
    491 if dtype is not None:
--> 492     dtype = pandas_dtype(dtype)
    494     # a compound dtype
    495     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1615, in pandas_dtype(dtype)
   1612     return dtype
   1614 # registered extension types
-> 1615 result = registry.find(dtype)
   1616 if result is not None:
   1617     if isinstance(result, type):
   1618         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:537, in Registry.find(self, dtype)
    535 for dtype_type in self.dtypes:
    536     try:
--> 537         return dtype_type.construct_from_string(dtype)
    538     except TypeError:
    539         pass

File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:172, in StringDtype.construct_from_string(cls, string)
    170     return cls(storage="python")
    171 elif string == "string[pyarrow]":
--> 172     return cls(storage="pyarrow")
    173 elif string == "string[pyarrow_numpy]":
    174     return cls(storage="pyarrow_numpy")

File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:126, in StringDtype.__init__(self, storage)
    121     raise ValueError(
    122         f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
    123         f"Got {storage} instead."
    124     )
    125 if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0:
--> 126     raise ImportError(
    127         "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
    128     )
    129 self.storage = storage

ImportError: pyarrow>=7.0.0 is required for PyArrow backed StringArray.

In [10]: ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))

NameError: name 'pa' is not defined

In [11]: ser_ad.dtype == ser_sd.dtype
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 ser_ad.dtype == ser_sd.dtype

NameError: name 'ser_ad' is not defined

In [12]: ser_sd.str.contains("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 1
----> 1 ser_sd.str.contains("a")

NameError: name 'ser_sd' is not defined

In [13]: ser_ad.str.contains("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 1
----> 1 ser_ad.str.contains("a")

NameError: name 'ser_ad' is not defined

For PyArrow types that accept parameters, you can pass in a PyArrow type with those parameters into ArrowDtype to use in the dtype parameter.

In [14]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[14], line 1
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [15]: list_str_type = pa.list_(pa.string())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 1
----> 1 list_str_type = pa.list_(pa.string())

NameError: name 'pa' is not defined

In [16]: ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 1
----> 1 ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))

NameError: name 'list_str_type' is not defined

In [17]: ser
Out[17]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64
In [18]: from datetime import time

In [19]: idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 1
----> 1 idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))

NameError: name 'pa' is not defined

In [20]: idx
Out[20]: [0.0, 1.0, 10.0]
In [21]: from decimal import Decimal

In [22]: decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 1
----> 1 decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))

NameError: name 'pa' is not defined

In [23]: data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]]

In [24]: df = pd.DataFrame(data, dtype=decimal_type)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 1
----> 1 df = pd.DataFrame(data, dtype=decimal_type)

NameError: name 'decimal_type' is not defined

In [25]: df
Out[25]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

If you already have a pyarrow.Array or pyarrow.ChunkedArray, you can pass it into arrays.ArrowExtensionArray to construct the associated Series, Index or DataFrame object.

In [26]: pa_array = pa.array(
   ....:     [{"1": "2"}, {"10": "20"}, None],
   ....:     type=pa.map_(pa.string(), pa.string()),
   ....: )
   ....: 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 1
----> 1 pa_array = pa.array(
      2     [{"1": "2"}, {"10": "20"}, None],
      3     type=pa.map_(pa.string(), pa.string()),
      4 )

NameError: name 'pa' is not defined

In [27]: ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 1
----> 1 ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))

NameError: name 'pa_array' is not defined

In [28]: ser
Out[28]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

To retrieve a pyarrow.ChunkedArray from a Series or Index, you can call the pyarrow array constructor on the Series or Index.

In [29]: ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:431, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    428     index = ensure_index(index)
    430 if dtype is not None:
--> 431     dtype = self._validate_dtype(dtype)
    433 if data is None:
    434     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:492, in NDFrame._validate_dtype(cls, dtype)
    490 """validate the passed dtype"""
    491 if dtype is not None:
--> 492     dtype = pandas_dtype(dtype)
    494     # a compound dtype
    495     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1615, in pandas_dtype(dtype)
   1612     return dtype
   1614 # registered extension types
-> 1615 result = registry.find(dtype)
   1616 if result is not None:
   1617     if isinstance(result, type):
   1618         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:537, in Registry.find(self, dtype)
    535 for dtype_type in self.dtypes:
    536     try:
--> 537         return dtype_type.construct_from_string(dtype)
    538     except TypeError:
    539         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2203, in ArrowDtype.construct_from_string(cls, string)
   2201 base_type = string[:-9]  # get rid of "[pyarrow]"
   2202 try:
-> 2203     pa_dtype = pa.type_for_alias(base_type)
   2204 except ValueError as err:
   2205     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [30]: pa.array(ser)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[30], line 1
----> 1 pa.array(ser)

NameError: name 'pa' is not defined

In [31]: idx = pd.Index(ser)

In [32]: pa.array(idx)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[32], line 1
----> 1 pa.array(idx)

NameError: name 'pa' is not defined

To convert a pyarrow.Table to a DataFrame, you can call the pyarrow.Table.to_pandas() method with types_mapper=pd.ArrowDtype.

In [33]: table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[33], line 1
----> 1 table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])

NameError: name 'pa' is not defined

In [34]: df = table.to_pandas(types_mapper=pd.ArrowDtype)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[34], line 1
----> 1 df = table.to_pandas(types_mapper=pd.ArrowDtype)

NameError: name 'table' is not defined

In [35]: df
Out[35]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

In [36]: df.dtypes
Out[36]: 
a    object
b    object
dtype: object

Operations

PyArrow data structure integration is implemented through pandas’ ExtensionArray interface; therefore, supported functionality exists where this interface is integrated within the pandas API. Additionally, this functionality is accelerated with PyArrow compute functions where available. This includes:

  • Numeric aggregations

  • Numeric arithmetic

  • Numeric rounding

  • Logical and comparison functions

  • String functionality

  • Datetime functionality

The following are just some examples of operations that are accelerated by native PyArrow compute functions.

In [37]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[37], line 1
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [38]: ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[38], line 1
----> 1 ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:431, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    428     index = ensure_index(index)
    430 if dtype is not None:
--> 431     dtype = self._validate_dtype(dtype)
    433 if data is None:
    434     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:492, in NDFrame._validate_dtype(cls, dtype)
    490 """validate the passed dtype"""
    491 if dtype is not None:
--> 492     dtype = pandas_dtype(dtype)
    494     # a compound dtype
    495     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1615, in pandas_dtype(dtype)
   1612     return dtype
   1614 # registered extension types
-> 1615 result = registry.find(dtype)
   1616 if result is not None:
   1617     if isinstance(result, type):
   1618         # GH 31356, GH 54592

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:537, in Registry.find(self, dtype)
    535 for dtype_type in self.dtypes:
    536     try:
--> 537         return dtype_type.construct_from_string(dtype)
    538     except TypeError:
    539         pass

File /usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py:2203, in ArrowDtype.construct_from_string(cls, string)
   2201 base_type = string[:-9]  # get rid of "[pyarrow]"
   2202 try:
-> 2203     pa_dtype = pa.type_for_alias(base_type)
   2204 except ValueError as err:
   2205     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [39]: ser.mean()
Out[39]: 2.0

In [40]: ser + ser
Out[40]: 
0    0.0
1    2.0
2    4.0
3    6.0
4    8.0
dtype: float64

In [41]: ser > (ser + 1)
Out[41]: 
0    False
1    False
2    False
3    False
4    False
dtype: bool

In [42]: ser.dropna()
Out[42]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [43]: ser.isna()
Out[43]: 
0    False
1    False
2    False
3    False
4    False
dtype: bool

In [44]: ser.fillna(0)
Out[44]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64
In [45]: ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[45], line 1
----> 1 ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))

NameError: name 'pa' is not defined

In [46]: ser_str.str.startswith("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[46], line 1
----> 1 ser_str.str.startswith("a")

NameError: name 'ser_str' is not defined
In [47]: from datetime import datetime

In [48]: pa_type = pd.ArrowDtype(pa.timestamp("ns"))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[48], line 1
----> 1 pa_type = pd.ArrowDtype(pa.timestamp("ns"))

NameError: name 'pa' is not defined

In [49]: ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[49], line 1
----> 1 ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)

NameError: name 'pa_type' is not defined

In [50]: ser_dt.dt.strftime("%Y-%m")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[50], line 1
----> 1 ser_dt.dt.strftime("%Y-%m")

NameError: name 'ser_dt' is not defined

I/O Reading

PyArrow also provides IO reading functionality that has been integrated into several pandas IO readers. Several of these reader functions, such as read_csv() shown below, provide an engine keyword that can dispatch to PyArrow to accelerate reading from an IO source.

In [51]: import io

In [52]: data = io.StringIO("""a,b,c
   ....:    1,2.5,True
   ....:    3,4.5,False
   ....: """)
   ....: 

In [53]: df = pd.read_csv(data, engine="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:137, in import_optional_dependency(name, extra, errors, min_version)
    136 try:
--> 137     module = importlib.import_module(name)
    138 except ImportError:

File /usr/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[53], line 1
----> 1 df = pd.read_csv(data, engine="pyarrow")

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    944     dtype_backend=dtype_backend,
    945 )
    946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:617, in _read(filepath_or_buffer, kwds)
    614     return parser
    616 with parser:
--> 617     return parser.read(nrows)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1736, in TextFileReader.read(self, nrows)
   1733 if self.engine == "pyarrow":
   1734     try:
   1735         # error: "ParserBase" has no attribute "read"
-> 1736         df = self._engine.read()  # type: ignore[attr-defined]
   1737     except Exception:
   1738         self.close()

File /usr/lib/python3/dist-packages/pandas/io/parsers/arrow_parser_wrapper.py:189, in ArrowParserWrapper.read(self)
    178 def read(self) -> DataFrame:
    179     """
    180     Reads the contents of a CSV file into a DataFrame and
    181     processes it according to the kwargs passed in the
   (...)
    187         The DataFrame created from the CSV file.
    188     """
--> 189     pa = import_optional_dependency("pyarrow")
    190     pyarrow_csv = import_optional_dependency("pyarrow.csv")
    191     self._get_pyarrow_options()

File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:140, in import_optional_dependency(name, extra, errors, min_version)
    138 except ImportError:
    139     if errors == "raise":
--> 140         raise ImportError(msg)
    141     return None
    143 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [54]: df
Out[54]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

By default, these functions and all other IO reader functions return NumPy-backed data. These readers can return PyArrow-backed data by specifying the parameter dtype_backend="pyarrow". A reader does not necessarily need to set engine="pyarrow" to return PyArrow-backed data.

In [55]: import io

In [56]: data = io.StringIO("""a,b,c,d,e,f,g,h,i
   ....:     1,2.5,True,a,,,,,
   ....:     3,4.5,False,b,6,7.5,True,a,
   ....: """)
   ....: 

In [57]: df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:137, in import_optional_dependency(name, extra, errors, min_version)
    136 try:
--> 137     module = importlib.import_module(name)
    138 except ImportError:

File /usr/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[57], line 1
----> 1 df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    944     dtype_backend=dtype_backend,
    945 )
    946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:611, in _read(filepath_or_buffer, kwds)
    608 _validate_names(kwds.get("names", None))
    610 # Create the parser.
--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)
    613 if chunksize or iterator:
    614     return parser

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)
   1445     self.options["has_index_names"] = kwds["has_index_names"]
   1447 self.handles: IOHandles | None = None
-> 1448 self._engine = self._make_engine(f, self.engine)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1723, in TextFileReader._make_engine(self, f, engine)
   1720     raise ValueError(msg)
   1722 try:
-> 1723     return mapping[engine](f, **self.options)
   1724 except Exception:
   1725     if self.handles is not None:

File /usr/lib/python3/dist-packages/pandas/io/parsers/c_parser_wrapper.py:92, in CParserWrapper.__init__(self, src, **kwds)
     89     kwds["dtype_backend"] = "numpy"
     90 if kwds["dtype_backend"] == "pyarrow":
     91     # Fail here loudly instead of in cython after reading
---> 92     import_optional_dependency("pyarrow")
     93 self._reader = parsers.TextReader(src, **kwds)
     95 self.unnamed_cols = self._reader.unnamed_cols

File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:140, in import_optional_dependency(name, extra, errors, min_version)
    138 except ImportError:
    139     if errors == "raise":
--> 140         raise ImportError(msg)
    141     return None
    143 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [58]: df_pyarrow.dtypes
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[58], line 1
----> 1 df_pyarrow.dtypes

NameError: name 'df_pyarrow' is not defined

Several non-IO reader functions can also use the dtype_backend argument to return PyArrow-backed data including: