pyo3_introspection/
introspection.rs

1use crate::model::{Argument, Arguments, Class, Const, Function, Module, VariableLengthArgument};
2use anyhow::{bail, ensure, Context, Result};
3use goblin::elf::Elf;
4use goblin::mach::load_command::CommandVariant;
5use goblin::mach::symbols::{NO_SECT, N_SECT};
6use goblin::mach::{Mach, MachO, SingleArch};
7use goblin::pe::PE;
8use goblin::Object;
9use serde::Deserialize;
10use std::cmp::Ordering;
11use std::collections::HashMap;
12use std::fs;
13use std::path::Path;
14
15/// Introspect a cdylib built with PyO3 and returns the definition of a Python module.
16///
17/// This function currently supports the ELF (most *nix including Linux), Match-O (macOS) and PE (Windows) formats.
18pub fn introspect_cdylib(library_path: impl AsRef<Path>, main_module_name: &str) -> Result<Module> {
19    let chunks = find_introspection_chunks_in_binary_object(library_path.as_ref())?;
20    parse_chunks(&chunks, main_module_name)
21}
22
23/// Parses the introspection chunks found in the binary
24fn parse_chunks(chunks: &[Chunk], main_module_name: &str) -> Result<Module> {
25    let mut chunks_by_id = HashMap::<&str, &Chunk>::new();
26    let mut chunks_by_parent = HashMap::<&str, Vec<&Chunk>>::new();
27    for chunk in chunks {
28        if let Some(id) = match chunk {
29            Chunk::Module { id, .. } => Some(id),
30            Chunk::Class { id, .. } => Some(id),
31            Chunk::Function { id, .. } => id.as_ref(),
32        } {
33            chunks_by_id.insert(id, chunk);
34        }
35        if let Some(parent) = match chunk {
36            Chunk::Module { .. } | Chunk::Class { .. } => None,
37            Chunk::Function { parent, .. } => parent.as_ref(),
38        } {
39            chunks_by_parent.entry(parent).or_default().push(chunk);
40        }
41    }
42    // We look for the root chunk
43    for chunk in chunks {
44        if let Chunk::Module {
45            name,
46            members,
47            consts,
48            id: _,
49        } = chunk
50        {
51            if name == main_module_name {
52                return convert_module(name, members, consts, &chunks_by_id, &chunks_by_parent);
53            }
54        }
55    }
56    bail!("No module named {main_module_name} found")
57}
58
59fn convert_module(
60    name: &str,
61    members: &[String],
62    consts: &[ConstChunk],
63    chunks_by_id: &HashMap<&str, &Chunk>,
64    chunks_by_parent: &HashMap<&str, Vec<&Chunk>>,
65) -> Result<Module> {
66    let (modules, classes, functions) = convert_members(
67        &members
68            .iter()
69            .filter_map(|id| chunks_by_id.get(id.as_str()).copied())
70            .collect::<Vec<_>>(),
71        chunks_by_id,
72        chunks_by_parent,
73    )?;
74
75    Ok(Module {
76        name: name.into(),
77        modules,
78        classes,
79        functions,
80        consts: consts
81            .iter()
82            .map(|c| Const {
83                name: c.name.clone(),
84                value: c.value.clone(),
85            })
86            .collect(),
87    })
88}
89
90/// Convert a list of members of a module or a class
91fn convert_members(
92    chunks: &[&Chunk],
93    chunks_by_id: &HashMap<&str, &Chunk>,
94    chunks_by_parent: &HashMap<&str, Vec<&Chunk>>,
95) -> Result<(Vec<Module>, Vec<Class>, Vec<Function>)> {
96    let mut modules = Vec::new();
97    let mut classes = Vec::new();
98    let mut functions = Vec::new();
99    for chunk in chunks {
100        match chunk {
101            Chunk::Module {
102                name,
103                members,
104                consts,
105                id: _,
106            } => {
107                modules.push(convert_module(
108                    name,
109                    members,
110                    consts,
111                    chunks_by_id,
112                    chunks_by_parent,
113                )?);
114            }
115            Chunk::Class { name, id } => {
116                classes.push(convert_class(id, name, chunks_by_id, chunks_by_parent)?)
117            }
118            Chunk::Function {
119                name,
120                id: _,
121                arguments,
122                parent: _,
123                decorators,
124                returns,
125            } => functions.push(convert_function(name, arguments, decorators, returns)),
126        }
127    }
128    Ok((modules, classes, functions))
129}
130
131fn convert_class(
132    id: &str,
133    name: &str,
134    chunks_by_id: &HashMap<&str, &Chunk>,
135    chunks_by_parent: &HashMap<&str, Vec<&Chunk>>,
136) -> Result<Class> {
137    let (nested_modules, nested_classes, mut methods) = convert_members(
138        chunks_by_parent
139            .get(&id)
140            .map(Vec::as_slice)
141            .unwrap_or_default(),
142        chunks_by_id,
143        chunks_by_parent,
144    )?;
145    ensure!(
146        nested_modules.is_empty(),
147        "Classes cannot contain nested modules"
148    );
149    ensure!(
150        nested_classes.is_empty(),
151        "Nested classes are not supported yet"
152    );
153    // We sort methods to get a stable output
154    methods.sort_by(|l, r| match l.name.cmp(&r.name) {
155        Ordering::Equal => {
156            // We put the getter before the setter
157            if l.decorators.iter().any(|d| d == "property") {
158                Ordering::Less
159            } else if r.decorators.iter().any(|d| d == "property") {
160                Ordering::Greater
161            } else {
162                // We pick an ordering based on decorators
163                l.decorators.cmp(&r.decorators)
164            }
165        }
166        o => o,
167    });
168    Ok(Class {
169        name: name.into(),
170        methods,
171    })
172}
173
174fn convert_function(
175    name: &str,
176    arguments: &ChunkArguments,
177    decorators: &[String],
178    returns: &Option<String>,
179) -> Function {
180    Function {
181        name: name.into(),
182        decorators: decorators.to_vec(),
183        arguments: Arguments {
184            positional_only_arguments: arguments.posonlyargs.iter().map(convert_argument).collect(),
185            arguments: arguments.args.iter().map(convert_argument).collect(),
186            vararg: arguments
187                .vararg
188                .as_ref()
189                .map(convert_variable_length_argument),
190            keyword_only_arguments: arguments.kwonlyargs.iter().map(convert_argument).collect(),
191            kwarg: arguments
192                .kwarg
193                .as_ref()
194                .map(convert_variable_length_argument),
195        },
196        returns: returns.clone(),
197    }
198}
199
200fn convert_argument(arg: &ChunkArgument) -> Argument {
201    Argument {
202        name: arg.name.clone(),
203        default_value: arg.default.clone(),
204        annotation: arg.annotation.clone(),
205    }
206}
207
208fn convert_variable_length_argument(arg: &ChunkArgument) -> VariableLengthArgument {
209    VariableLengthArgument {
210        name: arg.name.clone(),
211    }
212}
213
214fn find_introspection_chunks_in_binary_object(path: &Path) -> Result<Vec<Chunk>> {
215    let library_content =
216        fs::read(path).with_context(|| format!("Failed to read {}", path.display()))?;
217    match Object::parse(&library_content)
218        .context("The built library is not valid or not supported by our binary parser")?
219    {
220        Object::Elf(elf) => find_introspection_chunks_in_elf(&elf, &library_content),
221        Object::Mach(Mach::Binary(macho)) => {
222            find_introspection_chunks_in_macho(&macho, &library_content)
223        }
224        Object::Mach(Mach::Fat(multi_arch)) => {
225            for arch in &multi_arch {
226                match arch? {
227                    SingleArch::MachO(macho) => {
228                        return find_introspection_chunks_in_macho(&macho, &library_content)
229                    }
230                    SingleArch::Archive(_) => (),
231                }
232            }
233            bail!("No Mach-o chunk found in the multi-arch Mach-o container")
234        }
235        Object::PE(pe) => find_introspection_chunks_in_pe(&pe, &library_content),
236        _ => {
237            bail!("Only ELF, Mach-o and PE containers can be introspected")
238        }
239    }
240}
241
242fn find_introspection_chunks_in_elf(elf: &Elf<'_>, library_content: &[u8]) -> Result<Vec<Chunk>> {
243    let mut chunks = Vec::new();
244    for sym in &elf.syms {
245        if is_introspection_symbol(elf.strtab.get_at(sym.st_name).unwrap_or_default()) {
246            let section_header = &elf.section_headers[sym.st_shndx];
247            let data_offset = sym.st_value + section_header.sh_offset - section_header.sh_addr;
248            chunks.push(read_symbol_value_with_ptr_and_len(
249                &library_content[usize::try_from(data_offset).context("File offset overflow")?..],
250                0,
251                library_content,
252                elf.is_64,
253            )?);
254        }
255    }
256    Ok(chunks)
257}
258
259fn find_introspection_chunks_in_macho(
260    macho: &MachO<'_>,
261    library_content: &[u8],
262) -> Result<Vec<Chunk>> {
263    if !macho.little_endian {
264        bail!("Only little endian Mach-o binaries are supported");
265    }
266    ensure!(
267        !macho.load_commands.iter().any(|command| {
268            matches!(command.command, CommandVariant::DyldChainedFixups(_))
269        }),
270        "Mach-O binaries with fixup chains are not supported yet, to avoid using fixup chains, use `--codegen=link-arg=-no_fixup_chains` option."
271    );
272
273    let sections = macho
274        .segments
275        .sections()
276        .flatten()
277        .map(|t| t.map(|s| s.0))
278        .collect::<Result<Vec<_>, _>>()?;
279    let mut chunks = Vec::new();
280    for symbol in macho.symbols() {
281        let (name, nlist) = symbol?;
282        if nlist.is_global()
283            && nlist.get_type() == N_SECT
284            && nlist.n_sect != NO_SECT as usize
285            && is_introspection_symbol(name)
286        {
287            let section = &sections[nlist.n_sect - 1]; // Sections are counted from 1
288            let data_offset = nlist.n_value + u64::from(section.offset) - section.addr;
289            chunks.push(read_symbol_value_with_ptr_and_len(
290                &library_content[usize::try_from(data_offset).context("File offset overflow")?..],
291                0,
292                library_content,
293                macho.is_64,
294            )?);
295        }
296    }
297    Ok(chunks)
298}
299
300fn find_introspection_chunks_in_pe(pe: &PE<'_>, library_content: &[u8]) -> Result<Vec<Chunk>> {
301    let rdata_data_section = pe
302        .sections
303        .iter()
304        .find(|section| section.name().unwrap_or_default() == ".rdata")
305        .context("No .rdata section found")?;
306    let rdata_shift = pe.image_base
307        + usize::try_from(rdata_data_section.virtual_address)
308            .context(".rdata virtual_address overflow")?
309        - usize::try_from(rdata_data_section.pointer_to_raw_data)
310            .context(".rdata pointer_to_raw_data overflow")?;
311
312    let mut chunks = Vec::new();
313    for export in &pe.exports {
314        if is_introspection_symbol(export.name.unwrap_or_default()) {
315            chunks.push(read_symbol_value_with_ptr_and_len(
316                &library_content[export.offset.context("No symbol offset")?..],
317                rdata_shift,
318                library_content,
319                pe.is_64,
320            )?);
321        }
322    }
323    Ok(chunks)
324}
325
326fn read_symbol_value_with_ptr_and_len(
327    value_slice: &[u8],
328    shift: usize,
329    full_library_content: &[u8],
330    is_64: bool,
331) -> Result<Chunk> {
332    let (ptr, len) = if is_64 {
333        let (ptr, len) = value_slice[..16].split_at(8);
334        let ptr = usize::try_from(u64::from_le_bytes(
335            ptr.try_into().context("Too short symbol value")?,
336        ))
337        .context("Pointer overflow")?;
338        let len = usize::try_from(u64::from_le_bytes(
339            len.try_into().context("Too short symbol value")?,
340        ))
341        .context("Length overflow")?;
342        (ptr, len)
343    } else {
344        let (ptr, len) = value_slice[..8].split_at(4);
345        let ptr = usize::try_from(u32::from_le_bytes(
346            ptr.try_into().context("Too short symbol value")?,
347        ))
348        .context("Pointer overflow")?;
349        let len = usize::try_from(u32::from_le_bytes(
350            len.try_into().context("Too short symbol value")?,
351        ))
352        .context("Length overflow")?;
353        (ptr, len)
354    };
355    let chunk = &full_library_content[ptr - shift..ptr - shift + len];
356    serde_json::from_slice(chunk).with_context(|| {
357        format!(
358            "Failed to parse introspection chunk: '{}'",
359            String::from_utf8_lossy(chunk)
360        )
361    })
362}
363
/// Tells whether `name` is the name of a symbol holding an introspection chunk.
///
/// Such names start with `PYO3_INTROSPECTION_0_`, optionally preceded by a single `_`.
fn is_introspection_symbol(name: &str) -> bool {
    let unprefixed = match name.strip_prefix('_') {
        Some(rest) => rest,
        None => name,
    };
    unprefixed.starts_with("PYO3_INTROSPECTION_0_")
}
369
/// An introspection record, as deserialized from the data embedded in the binary.
///
/// The variant is selected by the `type` field of the record, which holds the lowercase name of
/// the variant.
#[derive(Deserialize)]
#[serde(tag = "type", rename_all = "lowercase")]
enum Chunk {
    /// A module.
    Module {
        /// Identifier other chunks use to refer to this module.
        id: String,
        /// Name of the module; the root module is looked up by this name.
        name: String,
        /// Identifiers of the chunks that are members of this module.
        members: Vec<String>,
        /// Constants of this module.
        consts: Vec<ConstChunk>,
    },
    /// A class; its methods are the function chunks whose `parent` is the `id` of the class.
    Class {
        /// Identifier other chunks use to refer to this class.
        id: String,
        /// Name of the class.
        name: String,
    },
    /// A function or a method.
    Function {
        /// Identifier of the function, if any (absent from the record means `None`).
        #[serde(default)]
        id: Option<String>,
        /// Name of the function.
        name: String,
        /// Arguments of the function.
        arguments: Box<ChunkArguments>,
        /// Identifier of the chunk this function belongs to, if any.
        #[serde(default)]
        parent: Option<String>,
        /// Decorators of the function (empty if absent from the record).
        #[serde(default)]
        decorators: Vec<String>,
        /// What the function returns, if provided; copied as-is into the converted function.
        #[serde(default)]
        returns: Option<String>,
    },
}
396
/// A constant listed in a module chunk; both fields are copied as-is into the converted `Const`.
#[derive(Deserialize)]
struct ConstChunk {
    /// Name of the constant.
    name: String,
    /// Value of the constant, as a string.
    value: String,
}
402
/// The arguments of a function chunk.
///
/// Every field may be omitted from the record, in which case it is empty or `None`.
#[derive(Deserialize)]
struct ChunkArguments {
    /// Converted into the positional only arguments of the function.
    #[serde(default)]
    posonlyargs: Vec<ChunkArgument>,
    /// Converted into the regular arguments of the function.
    #[serde(default)]
    args: Vec<ChunkArgument>,
    /// Converted into the variable length argument of the function; only its name is kept.
    #[serde(default)]
    vararg: Option<ChunkArgument>,
    /// Converted into the keyword only arguments of the function.
    #[serde(default)]
    kwonlyargs: Vec<ChunkArgument>,
    /// Converted into the variable length keyword argument of the function; only its name is kept.
    #[serde(default)]
    kwarg: Option<ChunkArgument>,
}
416
/// A single argument of a function chunk.
#[derive(Deserialize)]
struct ChunkArgument {
    /// Name of the argument.
    name: String,
    /// Default value of the argument, if any (absent from the record means `None`).
    #[serde(default)]
    default: Option<String>,
    /// Annotation of the argument, if any (absent from the record means `None`).
    #[serde(default)]
    annotation: Option<String>,
}
⚠️ Internal Docs ⚠️ Not Public API 👉 Official Docs Here