pyo3/types/
string.rs

1#[cfg(not(Py_LIMITED_API))]
2use crate::exceptions::PyUnicodeDecodeError;
3use crate::ffi_ptr_ext::FfiPtrExt;
4use crate::instance::Borrowed;
5use crate::py_result_ext::PyResultExt;
6use crate::types::any::PyAnyMethods;
7use crate::types::bytes::PyBytesMethods;
8use crate::types::PyBytes;
9use crate::{ffi, Bound, Py, PyAny, PyResult, Python};
10use std::borrow::Cow;
11use std::ffi::CString;
12use std::str;
13
/// Borrowed view of the raw code units backing a Python `str`.
///
/// Python keeps strings in one of several fixed-width internal layouts; each variant of
/// this enum corresponds to one of those layouts and borrows the underlying storage.
#[cfg(not(Py_LIMITED_API))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 storage: one byte per code unit.
    Ucs1(&'a [u8]),

    /// UCS2 storage: two bytes per code unit.
    Ucs2(&'a [u16]),

    /// UCS4 storage: four bytes per code unit.
    Ucs4(&'a [u32]),
}
30
31#[cfg(not(Py_LIMITED_API))]
32impl<'a> PyStringData<'a> {
33    /// Obtain the raw bytes backing this instance as a [u8] slice.
34    pub fn as_bytes(&self) -> &[u8] {
35        match self {
36            Self::Ucs1(s) => s,
37            Self::Ucs2(s) => unsafe {
38                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
39            },
40            Self::Ucs4(s) => unsafe {
41                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
42            },
43        }
44    }
45
46    /// Size in bytes of each value/item in the underlying slice.
47    #[inline]
48    pub fn value_width_bytes(&self) -> usize {
49        match self {
50            Self::Ucs1(_) => 1,
51            Self::Ucs2(_) => 2,
52            Self::Ucs4(_) => 4,
53        }
54    }
55
56    /// Convert the raw data to a Rust string.
57    ///
58    /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
59    /// returns an owned string.
60    ///
61    /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported
62    /// storage format. This should only occur for strings that were created via Python
63    /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should
64    /// never occur for strings that were created from Python code.
65    pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
66        use std::ffi::CStr;
67        match self {
68            Self::Ucs1(data) => match str::from_utf8(data) {
69                Ok(s) => Ok(Cow::Borrowed(s)),
70                Err(e) => Err(PyUnicodeDecodeError::new_utf8(py, data, e)?.into()),
71            },
72            Self::Ucs2(data) => match String::from_utf16(data) {
73                Ok(s) => Ok(Cow::Owned(s)),
74                Err(e) => {
75                    let mut message = e.to_string().as_bytes().to_vec();
76                    message.push(0);
77
78                    Err(PyUnicodeDecodeError::new(
79                        py,
80                        ffi::c_str!("utf-16"),
81                        self.as_bytes(),
82                        0..self.as_bytes().len(),
83                        CStr::from_bytes_with_nul(&message).unwrap(),
84                    )?
85                    .into())
86                }
87            },
88            Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() {
89                Some(s) => Ok(Cow::Owned(s)),
90                None => Err(PyUnicodeDecodeError::new(
91                    py,
92                    ffi::c_str!("utf-32"),
93                    self.as_bytes(),
94                    0..self.as_bytes().len(),
95                    ffi::c_str!("error converting utf-32"),
96                )?
97                .into()),
98            },
99        }
100    }
101
102    /// Convert the raw data to a Rust string, possibly with data loss.
103    ///
104    /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
105    ///
106    /// Returns a borrow into original data, when possible, or owned data otherwise.
107    ///
108    /// The return value of this function should only disagree with [Self::to_string]
109    /// when that method would error.
110    pub fn to_string_lossy(self) -> Cow<'a, str> {
111        match self {
112            Self::Ucs1(data) => String::from_utf8_lossy(data),
113            Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
114            Self::Ucs4(data) => Cow::Owned(
115                data.iter()
116                    .map(|&c| std::char::from_u32(c).unwrap_or('\u{FFFD}'))
117                    .collect(),
118            ),
119        }
120    }
121}
122
123/// Represents a Python `string` (a Unicode string object).
124///
125/// Values of this type are accessed via PyO3's smart pointers, e.g. as
126/// [`Py<PyString>`][crate::Py] or [`Bound<'py, PyString>`][Bound].
127///
128/// For APIs available on `str` objects, see the [`PyStringMethods`] trait which is implemented for
129/// [`Bound<'py, PyString>`][Bound].
130///
131/// # Equality
132///
133/// For convenience, [`Bound<'py, PyString>`] implements [`PartialEq<str>`] to allow comparing the
134/// data in the Python string to a Rust UTF-8 string slice.
135///
136/// This is not always the most appropriate way to compare Python strings, as Python string subclasses
137/// may have different equality semantics. In situations where subclasses overriding equality might be
138/// relevant, use [`PyAnyMethods::eq`], at cost of the additional overhead of a Python method call.
139///
140/// ```rust
141/// # use pyo3::prelude::*;
142/// use pyo3::types::PyString;
143///
144/// # Python::with_gil(|py| {
145/// let py_string = PyString::new(py, "foo");
146/// // via PartialEq<str>
147/// assert_eq!(py_string, "foo");
148///
149/// // via Python equality
150/// assert!(py_string.as_any().eq("foo").unwrap());
151/// # });
152/// ```
153#[repr(transparent)]
154pub struct PyString(PyAny);
155
156pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check);
157
158impl PyString {
159    /// Creates a new Python string object.
160    ///
161    /// Panics if out of memory.
162    pub fn new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
163        let ptr = s.as_ptr().cast();
164        let len = s.len() as ffi::Py_ssize_t;
165        unsafe {
166            ffi::PyUnicode_FromStringAndSize(ptr, len)
167                .assume_owned(py)
168                .downcast_into_unchecked()
169        }
170    }
171
172    /// Intern the given string
173    ///
174    /// This will return a reference to the same Python string object if called repeatedly with the same string.
175    ///
176    /// Note that while this is more memory efficient than [`PyString::new`], it unconditionally allocates a
177    /// temporary Python string object and is thereby slower than [`PyString::new`].
178    ///
179    /// Panics if out of memory.
180    pub fn intern<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
181        let ptr = s.as_ptr().cast();
182        let len = s.len() as ffi::Py_ssize_t;
183        unsafe {
184            let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len);
185            if !ob.is_null() {
186                ffi::PyUnicode_InternInPlace(&mut ob);
187            }
188            ob.assume_owned(py).downcast_into_unchecked()
189        }
190    }
191
192    /// Attempts to create a Python string from a Python [bytes-like object].
193    ///
194    /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object).
195    pub fn from_object<'py>(
196        src: &Bound<'py, PyAny>,
197        encoding: &str,
198        errors: &str,
199    ) -> PyResult<Bound<'py, PyString>> {
200        let encoding = CString::new(encoding)?;
201        let errors = CString::new(errors)?;
202        unsafe {
203            ffi::PyUnicode_FromEncodedObject(
204                src.as_ptr(),
205                encoding.as_ptr().cast(),
206                errors.as_ptr().cast(),
207            )
208            .assume_owned_or_err(src.py())
209            .downcast_into_unchecked()
210        }
211    }
212}
213
214/// Implementation of functionality for [`PyString`].
215///
216/// These methods are defined for the `Bound<'py, PyString>` smart pointer, so to use method call
217/// syntax these methods are separated into a trait, because stable Rust does not yet support
218/// `arbitrary_self_types`.
219#[doc(alias = "PyString")]
220pub trait PyStringMethods<'py>: crate::sealed::Sealed {
221    /// Gets the Python string as a Rust UTF-8 string slice.
222    ///
223    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
224    /// (containing unpaired surrogates).
225    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
226    fn to_str(&self) -> PyResult<&str>;
227
228    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
229    ///
230    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
231    /// (containing unpaired surrogates).
232    fn to_cow(&self) -> PyResult<Cow<'_, str>>;
233
234    /// Converts the `PyString` into a Rust string.
235    ///
236    /// Unpaired surrogates invalid UTF-8 sequences are
237    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
238    fn to_string_lossy(&self) -> Cow<'_, str>;
239
240    /// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
241    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;
242
243    /// Obtains the raw data backing the Python string.
244    ///
245    /// If the Python string object was created through legacy APIs, its internal storage format
246    /// will be canonicalized before data is returned.
247    ///
248    /// # Safety
249    ///
250    /// This function implementation relies on manually decoding a C bitfield. In practice, this
251    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
252    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
253    /// x86_64 platforms.
254    ///
255    /// By using this API, you accept responsibility for testing that PyStringData behaves as
256    /// expected on the targets where you plan to distribute your software.
257    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
258    unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
259}
260
261impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
262    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
263    fn to_str(&self) -> PyResult<&str> {
264        self.as_borrowed().to_str()
265    }
266
267    fn to_cow(&self) -> PyResult<Cow<'_, str>> {
268        self.as_borrowed().to_cow()
269    }
270
271    fn to_string_lossy(&self) -> Cow<'_, str> {
272        self.as_borrowed().to_string_lossy()
273    }
274
275    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
276        unsafe {
277            ffi::PyUnicode_AsUTF8String(self.as_ptr())
278                .assume_owned_or_err(self.py())
279                .downcast_into_unchecked::<PyBytes>()
280        }
281    }
282
283    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
284    unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
285        unsafe { self.as_borrowed().data() }
286    }
287}
288
289impl<'a> Borrowed<'a, '_, PyString> {
290    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
291    #[allow(clippy::wrong_self_convention)]
292    pub(crate) fn to_str(self) -> PyResult<&'a str> {
293        // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
294        let mut size: ffi::Py_ssize_t = 0;
295        let data: *const u8 =
296            unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
297        if data.is_null() {
298            Err(crate::PyErr::fetch(self.py()))
299        } else {
300            Ok(unsafe {
301                std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize))
302            })
303        }
304    }
305
306    #[allow(clippy::wrong_self_convention)]
307    pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> {
308        // TODO: this method can probably be deprecated once Python 3.9 support is dropped,
309        // because all versions then support the more efficient `to_str`.
310        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
311        {
312            self.to_str().map(Cow::Borrowed)
313        }
314
315        #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
316        {
317            let bytes = self.encode_utf8()?;
318            Ok(Cow::Owned(
319                unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
320            ))
321        }
322    }
323
324    #[allow(clippy::wrong_self_convention)]
325    fn to_string_lossy(self) -> Cow<'a, str> {
326        let ptr = self.as_ptr();
327        let py = self.py();
328
329        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
330        if let Ok(s) = self.to_str() {
331            return Cow::Borrowed(s);
332        }
333
334        let bytes = unsafe {
335            ffi::PyUnicode_AsEncodedString(
336                ptr,
337                ffi::c_str!("utf-8").as_ptr(),
338                ffi::c_str!("surrogatepass").as_ptr(),
339            )
340            .assume_owned(py)
341            .downcast_into_unchecked::<PyBytes>()
342        };
343        Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned())
344    }
345
346    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
347    unsafe fn data(self) -> PyResult<PyStringData<'a>> {
348        unsafe {
349            let ptr = self.as_ptr();
350
351            #[cfg(not(Py_3_12))]
352            #[allow(deprecated)]
353            {
354                let ready = ffi::PyUnicode_READY(ptr);
355                if ready != 0 {
356                    // Exception was created on failure.
357                    return Err(crate::PyErr::fetch(self.py()));
358                }
359            }
360
361            // The string should be in its canonical form after calling `PyUnicode_READY()`.
362            // And non-canonical form not possible after Python 3.12. So it should be safe
363            // to call these APIs.
364            let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
365            let raw_data = ffi::PyUnicode_DATA(ptr);
366            let kind = ffi::PyUnicode_KIND(ptr);
367
368            match kind {
369                ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
370                    raw_data as *const u8,
371                    length,
372                ))),
373                ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
374                    raw_data as *const u16,
375                    length,
376                ))),
377                ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
378                    raw_data as *const u32,
379                    length,
380                ))),
381                _ => unreachable!(),
382            }
383        }
384    }
385}
386
387impl Py<PyString> {
388    /// Gets the Python string as a Rust UTF-8 string slice.
389    ///
390    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
391    /// (containing unpaired surrogates).
392    ///
393    /// Because `str` objects are immutable, the returned slice is independent of
394    /// the GIL lifetime.
395    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
396    pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
397        self.bind_borrowed(py).to_str()
398    }
399
400    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
401    ///
402    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
403    /// (containing unpaired surrogates).
404    ///
405    /// Because `str` objects are immutable, the returned slice is independent of
406    /// the GIL lifetime.
407    pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
408        self.bind_borrowed(py).to_cow()
409    }
410
411    /// Converts the `PyString` into a Rust string.
412    ///
413    /// Unpaired surrogates invalid UTF-8 sequences are
414    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
415    ///
416    /// Because `str` objects are immutable, the returned slice is independent of
417    /// the GIL lifetime.
418    pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
419        self.bind_borrowed(py).to_string_lossy()
420    }
421}
422
423/// Compares whether the data in the Python string is equal to the given UTF8.
424///
425/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
426impl PartialEq<str> for Bound<'_, PyString> {
427    #[inline]
428    fn eq(&self, other: &str) -> bool {
429        self.as_borrowed() == *other
430    }
431}
432
433/// Compares whether the data in the Python string is equal to the given UTF8.
434///
435/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
436impl PartialEq<&'_ str> for Bound<'_, PyString> {
437    #[inline]
438    fn eq(&self, other: &&str) -> bool {
439        self.as_borrowed() == **other
440    }
441}
442
443/// Compares whether the data in the Python string is equal to the given UTF8.
444///
445/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
446impl PartialEq<Bound<'_, PyString>> for str {
447    #[inline]
448    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
449        *self == other.as_borrowed()
450    }
451}
452
453/// Compares whether the data in the Python string is equal to the given UTF8.
454///
455/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
456impl PartialEq<&'_ Bound<'_, PyString>> for str {
457    #[inline]
458    fn eq(&self, other: &&Bound<'_, PyString>) -> bool {
459        *self == other.as_borrowed()
460    }
461}
462
463/// Compares whether the data in the Python string is equal to the given UTF8.
464///
465/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
466impl PartialEq<Bound<'_, PyString>> for &'_ str {
467    #[inline]
468    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
469        **self == other.as_borrowed()
470    }
471}
472
473/// Compares whether the data in the Python string is equal to the given UTF8.
474///
475/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
476impl PartialEq<str> for &'_ Bound<'_, PyString> {
477    #[inline]
478    fn eq(&self, other: &str) -> bool {
479        self.as_borrowed() == other
480    }
481}
482
483/// Compares whether the data in the Python string is equal to the given UTF8.
484///
485/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
486impl PartialEq<str> for Borrowed<'_, '_, PyString> {
487    #[inline]
488    fn eq(&self, other: &str) -> bool {
489        #[cfg(not(Py_3_13))]
490        {
491            self.to_cow().map_or(false, |s| s == other)
492        }
493
494        #[cfg(Py_3_13)]
495        unsafe {
496            ffi::PyUnicode_EqualToUTF8AndSize(
497                self.as_ptr(),
498                other.as_ptr().cast(),
499                other.len() as _,
500            ) == 1
501        }
502    }
503}
504
505/// Compares whether the data in the Python string is equal to the given UTF8.
506///
507/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
508impl PartialEq<&str> for Borrowed<'_, '_, PyString> {
509    #[inline]
510    fn eq(&self, other: &&str) -> bool {
511        *self == **other
512    }
513}
514
515/// Compares whether the data in the Python string is equal to the given UTF8.
516///
517/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
518impl PartialEq<Borrowed<'_, '_, PyString>> for str {
519    #[inline]
520    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
521        other == self
522    }
523}
524
525/// Compares whether the data in the Python string is equal to the given UTF8.
526///
527/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
528impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str {
529    #[inline]
530    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
531        other == self
532    }
533}
534
/// Unit tests for `PyString`, `PyStringData` and the string comparison impls.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{IntoPyObject, PyObject};

    // --- `to_cow` / `encode_utf8` on `Bound<PyString>` ---

    #[test]
    fn test_to_cow_utf8() {
        Python::with_gil(|py| {
            let expected = "ascii 🐈";
            let py_string = PyString::new(py, expected);
            assert_eq!(expected, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_to_cow_surrogate() {
        Python::with_gil(|py| {
            // A lone surrogate cannot be represented as UTF-8.
            let py_string = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();
            assert!(py_string.to_cow().is_err());
        })
    }

    #[test]
    fn test_to_cow_unicode() {
        Python::with_gil(|py| {
            let expected = "哈哈🐈";
            let py_string = PyString::new(py, expected);
            assert_eq!(expected, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_encode_utf8_unicode() {
        Python::with_gil(|py| {
            let expected = "哈哈🐈";
            let py_string = PyString::new(py, expected);
            assert_eq!(
                expected.as_bytes(),
                py_string.encode_utf8().unwrap().as_bytes()
            );
        })
    }

    #[test]
    fn test_encode_utf8_surrogate() {
        Python::with_gil(|py| {
            let obj: PyObject = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .into();
            assert!(obj
                .bind(py)
                .downcast::<PyString>()
                .unwrap()
                .encode_utf8()
                .is_err());
        })
    }

    #[test]
    fn test_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();

            // The surrogate is encoded as three bytes, each of which is replaced.
            assert_eq!(
                py_string.to_string_lossy(),
                "🐈 Hello \u{fffd}\u{fffd}\u{fffd}World"
            );
        })
    }

    // --- Formatting ---

    #[test]
    fn test_debug_string() {
        Python::with_gil(|py| {
            let py_string = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{:?}", py_string), "'Hello\\n'");
        })
    }

    #[test]
    fn test_display_string() {
        Python::with_gil(|py| {
            let py_string = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{}", py_string), "Hello\n");
        })
    }

    // --- `PyString::from_object` ---

    #[test]
    fn test_string_from_object() {
        Python::with_gil(|py| {
            let py_bytes = PyBytes::new(py, b"ab\xFFcd");

            // With the "ignore" handler the invalid byte is dropped.
            let py_string = PyString::from_object(&py_bytes, "utf-8", "ignore").unwrap();

            let decoded = py_string.to_cow().unwrap();
            assert_eq!(decoded, "abcd");
        });
    }

    #[test]
    fn test_string_from_object_with_invalid_encoding_errors() {
        Python::with_gil(|py| {
            let py_bytes = PyBytes::new(py, b"abcd");

            // An interior NUL in either argument must be rejected.
            let result = PyString::from_object(&py_bytes, "utf\0-8", "ignore");
            assert!(result.is_err());

            let result = PyString::from_object(&py_bytes, "utf-8", "ign\0ore");
            assert!(result.is_err());
        });
    }

    // --- `PyStringData` ---

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy)))]
    fn test_string_data_ucs1() {
        Python::with_gil(|py| {
            let py_string = PyString::new(py, "hello, world");
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy)))]
    fn test_string_data_ucs1_invalid() {
        Python::with_gil(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let py_string =
                unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { py_string.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f\u{fffd}"));
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy)))]
    fn test_string_data_ucs2() {
        Python::with_gil(|py| {
            let obj = py.eval(ffi::c_str!("'foo\\ud800'"), None, None).unwrap();
            let py_string = obj.downcast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo\u{fffd}".to_string())
            );
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::with_gil(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let py_string =
                unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { py_string.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            // Written with escapes so the expected value is unambiguously U+FF22
            // (fullwidth "B", not ASCII 'B') followed by the replacement character.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{ff22}\u{fffd}".into())
            );
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy)))]
    fn test_string_data_ucs4() {
        Python::with_gil(|py| {
            let expected = "哈哈🐈";
            let py_string = PyString::new(py, expected);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>(expected.to_string())
            );
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::with_gil(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let py_string =
                unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { py_string.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{20000}\u{fffd}".into())
            );
        });
    }

    // --- Interning ---

    #[test]
    fn test_intern_string() {
        Python::with_gil(|py| {
            let first_foo = PyString::intern(py, "foo");
            assert_eq!(first_foo, "foo");

            let second_foo = PyString::intern(py, "foo");
            assert_eq!(second_foo, "foo");

            // Interning the same text twice yields the same object.
            assert_eq!(first_foo.as_ptr(), second_foo.as_ptr());

            let bar = PyString::intern(py, "bar");
            assert_eq!(bar, "bar");

            assert_ne!(first_foo.as_ptr(), bar.as_ptr());
        });
    }

    // --- Methods on `Py<PyString>` ---

    #[test]
    fn test_py_to_str_utf8() {
        Python::with_gil(|py| {
            let expected = "ascii 🐈";
            let py_string = PyString::new(py, expected).unbind();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert_eq!(expected, py_string.to_str(py).unwrap());

            assert_eq!(expected, py_string.to_cow(py).unwrap());
        })
    }

    #[test]
    fn test_py_to_str_surrogate() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .extract()
                .unwrap();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert!(py_string.to_str(py).is_err());

            assert!(py_string.to_cow(py).is_err());
        })
    }

    #[test]
    fn test_py_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .extract()
                .unwrap();
            assert_eq!(
                py_string.to_string_lossy(py),
                "🐈 Hello \u{fffd}\u{fffd}\u{fffd}World"
            );
        })
    }

    // --- `PartialEq` impls ---

    #[test]
    fn test_comparisons() {
        Python::with_gil(|py| {
            let s = "hello, world";
            let py_string = PyString::new(py, s);

            assert_eq!(py_string, "hello, world");

            // `Bound<PyString>` against `&str`, in both operand orders.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            // `Bound<PyString>` against `str`, in both operand orders.
            assert_eq!(py_string, *s);
            assert_eq!(&py_string, *s);
            assert_eq!(*s, py_string);
            assert_eq!(*s, &py_string);

            let py_string = py_string.as_borrowed();

            // `Borrowed<PyString>` against `&str`, in both operand orders.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            // `Borrowed<PyString>` against `str`, in both operand orders.
            assert_eq!(py_string, *s);
            assert_eq!(*s, py_string);
        })
    }
}
⚠️ Internal Docs ⚠️ Not Public API 👉 Official Docs Here