diff --git a/xla/hlo/transforms/host_offloading_prepare.h b/xla/hlo/transforms/host_offloading_prepare.h index 1fbea3b167880..d45336e6111a0 100644 --- a/xla/hlo/transforms/host_offloading_prepare.h +++ b/xla/hlo/transforms/host_offloading_prepare.h @@ -28,20 +28,21 @@ namespace xla { -// This is a collection of rewrites that prepares HLO module for host -// offloading, mainly to work around different limitation of the compilation -// pipeline and runtime. These rewrites can be placed in a different parts of +// This is a collection of rewrites that prepares an HLO module for host +// offloading. These rewrites can be placed in different parts of // the overall compilation pipeline to prepare HLO module for host offloading -// for the given backend (different backends have different limitations). +// for the given backend. class HostOffloadingPrepare : public HloModulePass { public: enum class Rewrite { - // Currently host compute offloading requires that all temporary inputs are - // in device memory. If they are streamed inputs (inputs to the entry - // computation), they can be in either device or host memory. - // // This rewrite removes `MoveToHost` custom calls that feed directly into - // the computation offloading to the host. + // a host computation. + // + // In the HLO, it will look like HBM is directly fed into the host + // computation. The runtime will, once the async-call-start is executed, + // allocate a buffer on the host and copy the HBM buffer into it. This has + // the benefit that the device will never be blocking directly on the + // transfer, since that's clumped together with the computation. kElideMoveToHost, // Currently host compute offloading does not support tiled layouts, and