Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose new CUDA APIs for sharing the same memory between processes #1235

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
6 changes: 6 additions & 0 deletions Samples/CudaIPC/CudaIPC.Child/App.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.7"/>
</startup>
</configuration>
3 changes: 3 additions & 0 deletions Samples/CudaIPC/CudaIPC.Child/AssemblyAttributes.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
using System;

[assembly: CLSCompliant(true)]
16 changes: 16 additions & 0 deletions Samples/CudaIPC/CudaIPC.Child/CudaIPC.Child.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings for the CudaIPC.Child sample, built as an executable (OutputType Exe). -->
<PropertyGroup>
<!-- The target framework list comes from a property defined outside this file. -->
<TargetFrameworks>$(LibrarySamplesTargetFrameworks)</TargetFrameworks>
<OutputType>Exe</OutputType>
<LangVersion>8.0</LangVersion>
</PropertyGroup>

<!-- Enables the .NET analyzers with every rule turned on by default. -->
<PropertyGroup>
<EnableNETAnalyzers>true</EnableNETAnalyzers>
<AnalysisMode>AllEnabledByDefault</AnalysisMode>
</PropertyGroup>

<!-- The sample builds against the ILGPU library project from the source tree. -->
<ItemGroup>
<ProjectReference Include="..\..\..\Src\ILGPU\ILGPU.csproj" />
</ItemGroup>
</Project>
70 changes: 70 additions & 0 deletions Samples/CudaIPC/CudaIPC.Child/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// ---------------------------------------------------------------------------------------
// ILGPU Samples
// Copyright (c) 2021-2022 ILGPU Project
// www.ilgpu.net
//
// File: Program.cs
//
// This file is part of ILGPU and is distributed under the University of Illinois Open
// Source License. See LICENSE.txt for details.
// ---------------------------------------------------------------------------------------

using ILGPU;
using ILGPU.Runtime;
using ILGPU.Runtime.Cuda;
using System;
using System.Globalization;

namespace CudaIPC.Child
{
class Program
{
    /// <summary>
    /// A simple kernel writing the index to the data view.
    /// </summary>
    /// <param name="index">The current thread index.</param>
    /// <param name="dataView">The view pointing to our memory buffer.</param>
    static void SimpleKernel(
        Index1D index,
        ArrayView<int> dataView)
    {
        dataView[index] = index;
    }

    /// <summary>
    /// Accepts a CUDA device id, an IPC memory handle as hex string and the number of
    /// elements as arguments. It then maps that memory and executes a simple kernel
    /// on it.
    /// </summary>
    /// <param name="args">
    /// Exactly three values: device id, IPC memory handle (hex string), length.
    /// </param>
    static void Main(string[] args)
    {
        if (args.Length != 3)
        {
            Console.WriteLine("There should be 3 arguments:");
            Console.WriteLine("<device id> <ipc memory handle> <length>");
            return;
        }

        // Parse arguments
        int deviceId = int.Parse(args[0], CultureInfo.InvariantCulture);
        CudaIpcMemHandle ipcMemHandle =
            new CudaIpcMemHandle(Convert.FromHexString(args[1]));
        int length = int.Parse(args[2], CultureInfo.InvariantCulture);

        // Set up the correct accelerator
        using Context context = Context.CreateDefault();
        CudaDevice device = context.GetCudaDevice(deviceId);
        using CudaAccelerator accelerator = device.CreateCudaAccelerator(context);
        // device.PrintInformation();

        // Map exported memory. The buffer is declared with 'using' so that the
        // mapping is released again (before the accelerator is disposed) when this
        // process is done with it.
        using MemoryBuffer cudaIpcMemoryBuffer =
            accelerator.MapFromIpcMemHandle(
                ipcMemHandle,
                length,
                sizeof(int),
                CudaIpcMemFlags.LazyEnablePeerAccess);
        ArrayView<int> arrayView = cudaIpcMemoryBuffer.AsArrayView<int>(0, length);

        // Load and execute kernel
        Action<Index1D, ArrayView<int>> loadedSimpleKernel =
            accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<int>>(
                SimpleKernel);
        loadedSimpleKernel(arrayView.IntExtent, arrayView);

        // Kernel launches are asynchronous: wait for the kernel to finish before the
        // buffer and accelerator are torn down and the process exits, otherwise the
        // host process may read the shared buffer before it has been written.
        accelerator.Synchronize();
    }
}
}
6 changes: 6 additions & 0 deletions Samples/CudaIPC/CudaIPC.Host/App.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.7"/>
</startup>
</configuration>
3 changes: 3 additions & 0 deletions Samples/CudaIPC/CudaIPC.Host/AssemblyAttributes.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
using System;

[assembly: CLSCompliant(true)]
17 changes: 17 additions & 0 deletions Samples/CudaIPC/CudaIPC.Host/CudaIPC.Host.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings for the CudaIPC.Host sample, built as an executable (OutputType Exe). -->
<PropertyGroup>
<!-- The target framework list comes from a property defined outside this file. -->
<TargetFrameworks>$(LibrarySamplesTargetFrameworks)</TargetFrameworks>
<OutputType>Exe</OutputType>
<LangVersion>8.0</LangVersion>
</PropertyGroup>

<!-- Enables the .NET analyzers with every rule turned on by default. -->
<PropertyGroup>
<EnableNETAnalyzers>true</EnableNETAnalyzers>
<AnalysisMode>AllEnabledByDefault</AnalysisMode>
</PropertyGroup>

<ItemGroup>
<!-- The sample builds against the ILGPU library project from the source tree. -->
<ProjectReference Include="..\..\..\Src\ILGPU\ILGPU.csproj" />
<!-- The host program starts the CudaIPC.Child executable by file name; -->
<!-- NOTE(review): presumably this reference exists so the child is built and -->
<!-- placed next to the host output - confirm. -->
<ProjectReference Include="..\CudaIPC.Child\CudaIPC.Child.csproj" />
</ItemGroup>
</Project>
64 changes: 64 additions & 0 deletions Samples/CudaIPC/CudaIPC.Host/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// ---------------------------------------------------------------------------------------
// ILGPU Samples
// Copyright (c) 2021-2022 ILGPU Project
// www.ilgpu.net
//
// File: Program.cs
//
// This file is part of ILGPU and is distributed under the University of Illinois Open
// Source License. See LICENSE.txt for details.
// ---------------------------------------------------------------------------------------

using ILGPU;
using ILGPU.Runtime;
using ILGPU.Runtime.Cuda;
using System;
using System.Diagnostics;

namespace CudaIPC.Host
{
class Program
{
    /// <summary>
    /// Exports memory for other processes using CUDA IPC.
    /// For every CUDA device with IPC support, a buffer is allocated and exported,
    /// the CudaIPC.Child process is started with the exported handle, and the buffer
    /// contents are printed once the child has exited.
    /// </summary>
    static void Main()
    {
        // Create main context
        using var context = Context.CreateDefault();

        // For each available CUDA device...
        foreach (var device in context.GetCudaDevices())
        {
            // Check for IPC support first so that no accelerator is created for
            // devices that are skipped anyway.
            if (!device.HasIpcSupport)
            {
                Console.WriteLine(
                    $"{device.Name} does not support inter process communication!");
                continue;
            }

            // Create accelerator for the given device
            using CudaAccelerator accelerator =
                device.CreateCudaAccelerator(context);

            using MemoryBuffer1D<int, Stride1D.Dense> buffer =
                accelerator.Allocate1D<int>(64);

            // Export memory for other processes
            CudaIpcMemHandle cudaIpcMemHandle =
                accelerator.GetIpcMemoryHandle(buffer);
            string handleHex = Convert.ToHexString(cudaIpcMemHandle);

            // Launch CudaIPC.Child
            var arguments = $"{device.DeviceId} {handleHex} {buffer.Length}";
            Console.WriteLine(arguments);

            // Process is disposable; release its OS handle once the child is done.
            using var childProcess = Process.Start(
                OperatingSystem.IsWindows() ?
                "CudaIPC.Child.exe" : "CudaIPC.Child",
                arguments);
            childProcess?.WaitForExit();

            // Gets changed buffer data onto the CPU and print it.
            int[] bufferData = buffer.GetAsArray1D();
            Console.WriteLine(String.Join(" ", bufferData));
        }
    }
}
}
14 changes: 14 additions & 0 deletions Samples/ILGPU.Samples.sln
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ILGPU.Analyzers", "..\Src\I
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "InterleaveFields", "InterleaveFields\InterleaveFields.csproj", "{1E6D0BC6-CFA1-4F52-9EB9-CAA62DD2F33A}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CudaIPC.Host", "CudaIPC\CudaIPC.Host\CudaIPC.Host.csproj", "{CD0EB089-13B7-4229-AAC1-A8E586E46146}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CudaIPC.Child", "CudaIPC\CudaIPC.Child\CudaIPC.Child.csproj", "{6CFF471A-6CAB-481C-AEB8-464EF3700910}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -381,6 +385,14 @@ Global
{1E6D0BC6-CFA1-4F52-9EB9-CAA62DD2F33A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{1E6D0BC6-CFA1-4F52-9EB9-CAA62DD2F33A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{1E6D0BC6-CFA1-4F52-9EB9-CAA62DD2F33A}.Release|Any CPU.Build.0 = Release|Any CPU
{CD0EB089-13B7-4229-AAC1-A8E586E46146}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{CD0EB089-13B7-4229-AAC1-A8E586E46146}.Debug|Any CPU.Build.0 = Debug|Any CPU
{CD0EB089-13B7-4229-AAC1-A8E586E46146}.Release|Any CPU.ActiveCfg = Release|Any CPU
{CD0EB089-13B7-4229-AAC1-A8E586E46146}.Release|Any CPU.Build.0 = Release|Any CPU
{6CFF471A-6CAB-481C-AEB8-464EF3700910}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{6CFF471A-6CAB-481C-AEB8-464EF3700910}.Debug|Any CPU.Build.0 = Debug|Any CPU
{6CFF471A-6CAB-481C-AEB8-464EF3700910}.Release|Any CPU.ActiveCfg = Release|Any CPU
{6CFF471A-6CAB-481C-AEB8-464EF3700910}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -449,6 +461,8 @@ Global
{70B69CE3-24A9-463C-B14C-E2934988BBEE} = {25BA2234-5778-40BC-9386-9CE87AB87D1F}
{1C5E9E39-3C14-4B52-8D97-04555D5F6331} = {03FCC663-945D-4982-90D8-B14BE52D8FCD}
{1E6D0BC6-CFA1-4F52-9EB9-CAA62DD2F33A} = {C1D99632-ED4A-4B08-A14D-4C8DB375934F}
{CD0EB089-13B7-4229-AAC1-A8E586E46146} = {C1D99632-ED4A-4B08-A14D-4C8DB375934F}
{6CFF471A-6CAB-481C-AEB8-464EF3700910} = {C1D99632-ED4A-4B08-A14D-4C8DB375934F}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {30E502BD-3826-417F-888F-1CE19CF5C6DA}
Expand Down
92 changes: 90 additions & 2 deletions Src/ILGPU/Runtime/Cuda/CudaAPI.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// ---------------------------------------------------------------------------------------
// ILGPU
// Copyright (c) 2020-2023 ILGPU Project
// Copyright (c) 2024 ILGPU Project
// www.ilgpu.net
//
// File: CudaAPI.cs
Expand Down Expand Up @@ -484,6 +484,57 @@ public CudaError MemHostGetDevicePointer(
int flags) =>
cuMemHostGetDevicePointer_v2(out devicePtr, hostPtr, flags);

/// <summary>
/// Retrieves the IPC memory handle of a memory buffer.
/// </summary>
/// <param name="handle">Receives the IPC memory handle.</param>
/// <param name="devicePtr">The memory buffer.</param>
/// <returns>The error status.</returns>
/// <remarks>
/// This will zero the memory in the buffer and in case of small allocations under
/// 1 MB even neighboring memory will be zeroed.<br />
/// A buffer can only have one IPC memory handle, but multiple processes can use
/// the same handle.
/// </remarks>
public CudaError GetIpcMemoryHandle(
    out CudaIpcMemHandle handle,
    IntPtr devicePtr)
{
    CudaError status;
    handle = new CudaIpcMemHandle();

    // Pin the handle storage while the driver fills it in.
    fixed (byte* rawHandle = handle.Data)
    {
        status = cuIpcGetMemHandle(rawHandle, devicePtr);
    }
    return status;
}

/// <summary>
/// Opens a memory buffer from an IPC memory handle.
/// </summary>
/// <param name="devicePtr">The newly allocated memory.</param>
/// <param name="handle">An IPC memory handle from another process.</param>
/// <param name="flags">The flags to use.</param>
/// <returns>The error status.</returns>
/// <remarks>
/// This will not work with an IPC handle of the same process.
/// </remarks>
public CudaError OpenIpcMemoryHandle(
    out IntPtr devicePtr,
    CudaIpcMemHandle handle,
    CudaIpcMemFlags flags)
{
    CudaError status;

    // Pin the handle storage for the duration of the driver call.
    fixed (byte* rawHandle = handle.Data)
    {
        status = cuIpcOpenMemHandle_v2(out devicePtr, rawHandle, flags);
    }
    return status;
}

/// <summary>
/// Closes a memory buffer that was opened with <see cref="OpenIpcMemoryHandle"/>.
/// </summary>
/// <param name="devicePtr">The memory to close.</param>
/// <returns>The error status.</returns>
/// <remarks>
/// This will decrease the reference count of the memory in
/// <paramref name="devicePtr"/> by one; only if the count reaches 0 the memory will
/// be unmapped. The original memory in the exported process and mappings in other
/// processes will be unaffected.
/// </remarks>
public CudaError CloseIpcMemoryHandle(IntPtr devicePtr)
{
    return cuIpcCloseMemHandle(devicePtr);
}

#endregion

#region Stream Methods
Expand Down Expand Up @@ -748,7 +799,7 @@ public CudaError LaunchKernel(
kernelArgs);

/// <summary>
/// Computes the maximum number of blocks for maximum occupancy.
/// Computes the maximum number of blocks for maximum occupancy.
/// </summary>
/// <param name="numBlocks">The number of blocks.</param>
/// <param name="func">The function.</param>
Expand Down Expand Up @@ -886,6 +937,43 @@ public CudaError RecordEvent(IntPtr @event, IntPtr stream) =>
public CudaError SynchronizeEvent(IntPtr @event) =>
cuEventSynchronize(@event);

/// <summary>
/// Retrieves the IPC event handle of an event.
/// </summary>
/// <param name="handle">Receives the IPC event handle.</param>
/// <param name="devicePtr">The event.</param>
/// <returns>The error status.</returns>
public CudaError GetIpcEventHandle(
    out CudaIpcEventHandle handle,
    IntPtr devicePtr)
{
    CudaError status;
    handle = new CudaIpcEventHandle();

    // Pin the handle storage while the driver fills it in.
    fixed (byte* rawHandle = handle.Data)
    {
        status = cuIpcGetEventHandle(rawHandle, devicePtr);
    }
    return status;
}

/// <summary>
/// Opens an event from an IPC event handle.
/// </summary>
/// <param name="devicePtr">The newly opened event handle.</param>
/// <param name="handle">An IPC event handle from another process.</param>
/// <returns>The error status.</returns>
/// <remarks>
/// This will not work with an IPC handle of the same process.<br />
/// The event will behave like a locally created event.
/// </remarks>
public CudaError OpenIpcEventHandle(
    out IntPtr devicePtr,
    CudaIpcEventHandle handle)
{
    CudaError status;

    // Pin the handle storage for the duration of the driver call.
    fixed (byte* rawHandle = handle.Data)
    {
        status = cuIpcOpenEventHandle(out devicePtr, rawHandle);
    }
    return status;
}

#endregion
}
}
20 changes: 20 additions & 0 deletions Src/ILGPU/Runtime/Cuda/CudaAPI.xml
Original file line number Diff line number Diff line change
Expand Up @@ -217,4 +217,24 @@
<Import Name="cuEventSynchronize">
<Parameter Name="@event" Type="IntPtr" />
</Import>
<!-- IPC event export; called by CudaAPI.GetIpcEventHandle. -->
<Import Name="cuIpcGetEventHandle">
<Parameter Name="handle" Type="byte*" />
<Parameter Name="@event" Type="IntPtr" />
</Import>
<!-- IPC event import; called by CudaAPI.OpenIpcEventHandle. -->
<Import Name="cuIpcOpenEventHandle">
<Parameter Name="@event" Type="IntPtr" Flags="Out" />
<Parameter Name="handle" Type="byte*" />
</Import>
<!-- IPC memory export; called by CudaAPI.GetIpcMemoryHandle. -->
<Import Name="cuIpcGetMemHandle">
<Parameter Name="handle" Type="byte*" />
<Parameter Name="devicePtr" Type="IntPtr" />
</Import>
<!-- IPC memory import; called by CudaAPI.OpenIpcMemoryHandle. -->
<Import Name="cuIpcOpenMemHandle_v2">
<Parameter Name="devicePtr" Type="IntPtr" Flags="Out"/>
<Parameter Name="handle" Type="byte*" />
<Parameter Name="flags" Type="CudaIpcMemFlags" />
</Import>
<!-- Closes memory opened via IPC; called by CudaAPI.CloseIpcMemoryHandle. -->
<Import Name="cuIpcCloseMemHandle">
<Parameter Name="devicePtr" Type="IntPtr" />
</Import>
</Imports>
Loading