Casper O. da Costa-Luis
github/casperdcl
ctypes
https://docs.python.org/3/library/ctypes.html
ctypes
example
import ctypes
import numpy as np

# Load the compiled shared library that sits next to this script.
lib = ctypes.CDLL("./libmy_experiment.so")
# (re)define function signature for `void myfunc(float*, size_t)`
# Declaring the pointer as float* (not void*) lets ctypes reject
# arguments of the wrong pointer type before the C code runs.
lib.myfunc.argtypes = [ctypes.POINTER(ctypes.c_float), ctypes.c_size_t]
lib.myfunc.restype = None
input_array = np.array([1, 2, 3], dtype=np.float32)
# call function: hand over the array's data pointer and its element count
lib.myfunc(
    input_array.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    ctypes.c_size_t(len(input_array)))
https://docs.python.org/3/c-api/intro.html
#include <Python.h>
#include <numpy/arrayobject.h>
/* Python-callable `myfunc(arr)`: doubles every element of a float32 array
 * in place and returns None.
 *
 * Returns NULL with a Python exception set when argument parsing, the
 * array conversion, or the write-back step fails.
 */
static PyObject *myfunc(PyObject *self, PyObject *args) {
  PyObject *arr = NULL;
  if (!PyArg_ParseTuple(args, "O", &arr)) return NULL;
  /* INOUT_ARRAY2 requests a writable, aligned, C-contiguous float32 view;
   * if NumPy has to make a converted copy, the copy is written back to
   * `arr` by PyArray_ResolveWritebackIfCopy below. */
  PyArrayObject *np_arr = (PyArrayObject *)PyArray_FROM_OTF(
      arr, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
  if (np_arr == NULL) return NULL; /* conversion failed, exception is set */
  float *arr_ptr = (float *)PyArray_DATA(np_arr);
  /* total element count: valid for any rank, including 0-d arrays */
  const npy_intp size = PyArray_SIZE(np_arr);
  for (npy_intp i = 0; i < size; ++i) arr_ptr[i] *= 2;
  if (PyArray_ResolveWritebackIfCopy(np_arr) < 0) {
    Py_DECREF(np_arr);
    return NULL;
  }
  Py_DECREF(np_arr); /* release the reference PyArray_FROM_OTF returned */
  Py_RETURN_NONE;    /* a PyCFunction must always return an object or NULL */
}
static struct PyModuleDef my_module = {
PyModuleDef_HEAD_INIT,
.m_name = "my_experiment"
};
static PyMethodDef my_methods[] = {
{"myfunc", myfunc, METH_VARARGS, "In-place modifies ndarray[float]"},
{NULL, NULL, 0, NULL} // Sentinel
};
PyMODINIT_FUNC PyInit_spam(void) {
import_array();
return PyModule_Create(&my_module);
}
with my_experiment.{so,dll}
in
PYTHONPATH
:
https://docs.python.org/3/c-api/buffer.html
struct
to expose arrays
float *data
size_t ndim
size_t shape[]
https://github.com/AMYPAD/CuVec/blob/main/cuvec/include/cuvec_cpython.cuh#L128
/* Buffer-protocol exporter: fills `view` so that it describes the float
 * data behind `obj`. `flags` is accepted but not consulted. Always
 * returns 0. */
int my_buffer(PyObject *obj, Py_buffer *view, int flags) {
  /* element description: writable C floats, struct-style format "f" */
  view->format = "f";
  view->itemsize = sizeof(float);
  view->readonly = 0;

  /* memory and geometry, read through the MY_GET_ARR_* accessors */
  view->buf = (void *)MY_GET_ARR_PTR(obj);
  view->len = MY_GET_ARR_SIZE(obj) * sizeof(float);
  view->ndim = MY_GET_ARR_NDIM(obj);
  view->shape = MY_GET_ARR_SHAPE(obj);
  view->strides = MY_GET_ARR_STRIDES(obj);

  /* unused optional fields */
  view->internal = NULL;
  view->suboffsets = NULL;

  /* the view records its exporter and takes a reference to it */
  Py_INCREF(obj);
  view->obj = obj;
  return 0;
}
pybind11
, etc)
https://pybind11.readthedocs.io/en/stable/
#include <pybind11/pybind11.h>
// Doubles every element of a 1D, contiguous float32 buffer in place.
// Throws std::runtime_error when the buffer has the wrong rank, element
// type or layout; request(true) makes pybind11 reject read-only buffers.
void myfunc(pybind11::buffer view) {
  pybind11::buffer_info arr = view.request(/*writable=*/true);
  if (arr.ndim != 1) throw std::runtime_error("expected 1D array");
  if (arr.format != pybind11::format_descriptor<float>::format())
    throw std::runtime_error("expected float32 array");
  // plain pointer indexing below is only valid for unit-stride data
  if (arr.size > 1 && arr.strides[0] != arr.itemsize)
    throw std::runtime_error("expected contiguous array");
  // buffer_info::ptr is void*; C++ needs an explicit cast to float*
  float *ptr = static_cast<float *>(arr.ptr);
  const size_t n = static_cast<size_t>(arr.size);
  for (size_t i = 0; i < n; ++i) ptr[i] *= 2;
}
using namespace pybind11::literals;
// Python module `my_examples`: exposes `myfunc` with one named argument.
PYBIND11_MODULE(my_examples, m) {
  m.def("myfunc", &myfunc, pybind11::arg("input_array"),
        "In-place modifies ndarray[float]");
}
__cuda_array_interface__
https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
__dlpack__
https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html
https://dmlc.github.io/dlpack/latest/python_spec.html#reference-implementations
__cuda_array_interface__
but also handles ownership & multiple devices/streams@property
def __dlpack__(self, copy=False, max_version: tuple[int]=None,
stream: int=None, dl_device: tuple[int]=None
):
dl_tensor = DLTensor(data=self.addr, device=dl_device, ndim=self.ndim,
dtype=DLDataType.from_dtype(self.dtype),
shape=ctypes.cast(self.shape, ctypes.POINTER(ctypes.c_int64)),
strides=None, byte_offset=0)
managed_tensor = DLManagedTensor(dl_tensor=dl_tensor, manager_ctx=0,
deleter=DLTensorDeleter(lambda addr: None))
return pythonapi.PyCapsule_New(ctypes.byref(managed_tensor), b'dltensor', None)
#include <cuda_runtime.h>
int N = ...;
float cpu_data[N];
// fill the host-side input buffer
for (int i = 0; i < N; ++i) cpu_data[i] = ...;
// allocate a device buffer of the same size and copy the input to it
float *gpu_data;
cudaMalloc(&gpu_data, sizeof(float) * N);
cudaMemcpy(gpu_data, cpu_data, sizeof(float) * N, cudaMemcpyHostToDevice);
// launch one block of N threads on the device data
mykernel<<<1, N>>>(gpu_data, N);
cudaDeviceSynchronize();
// copy the result back to the host, then release the device buffer
cudaMemcpy(cpu_data, gpu_data, sizeof(float) * N, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaFree(gpu_data);
https://amypad.github.io/CuVec/
std::vector<T, {malloc, free}>
→
std::vector<T, {cudaMallocManaged, cudaFree}>
std::vector<T>::data()
std::vector<T>::size()
std::vector<T>::resize()
numpy.ndarray
https://amypad.github.io/NumCu/
Minimal Python/C++/CUDA library using CuVec
’s CPython
buffer
protocol.
pip install
pyproject.toml::build-system
requires = [cmake, scikit-build-core, pybind11, ...]
cmake
*.{py,so,dll}