Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adds ggml_pad_ext to allow prefix padding #864

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -1689,6 +1689,7 @@ extern "C" {
int ne3);

// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
// only supports postfix padding
GGML_API struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,
struct ggml_tensor * a,
Expand All @@ -1697,6 +1698,21 @@ extern "C" {
int p2,
int p3);


// pad each dimension with zeros: [x, ..., x] -> [0, ..., 0, x, ..., x, 0, ..., 0]
// supports prefix and postfix padding
GGML_API struct ggml_tensor * ggml_pad_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p00,
int p01,
int p10,
int p11,
int p20,
int p21,
int p30,
int p31);

// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
Expand Down
32 changes: 23 additions & 9 deletions src/ggml-cuda/pad.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "pad.cuh"

static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03, const int p00, const int p10, const int p20) {
// blockIdx.z: idx of ne2*ne3, aka ne02*ne03
// blockIdx.y: idx of ne1
// blockIdx.x: idx of ne0 / BLOCK_SIZE
Expand All @@ -14,11 +14,13 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {

if (nidx < ne00 + p00 && blockIdx.y < ne01 + p10 && blockIdx.z < (ne02*ne03) + p20 &&
nidx >= p00 && blockIdx.y >= p10 && blockIdx.z >= p20) {
int offset_src =
nidx +
blockIdx.y * ne00 +
blockIdx.z * ne00 * ne01;
(nidx - p00) +
(blockIdx.y - p10) * ne00 +
(blockIdx.z - p20) * ne00 * ne01;
dst[offset_dst] = x[offset_src];
} else {
dst[offset_dst] = 0.0f;
Expand All @@ -27,23 +29,35 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons

// launch helper for pad_f32: fills dst (shape [ne0, ne1, ne2, ne3]) from the
// contiguous f32 source x (shape [ne00, ne01, ne02, ne03]), shifting the source
// data by (p00, p10, p20) elements along dims 0..2 (prefix padding) and zeroing
// every element outside the shifted source region.
// NOTE(review): the scraped diff contained both the old and new parameter list /
// launch line; this keeps the updated (prefix-padding-aware) version.
static void pad_f32_cuda(const float * x, float * dst,
    const int ne00, const int ne01, const int ne02, const int ne03,
    const int ne0, const int ne1, const int ne2, const int ne3,
    const int p00, const int p10, const int p20, cudaStream_t stream) {
    // one thread per dst element along dim 0, rounded up to a whole block
    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne1, ne2*ne3);
    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03, p00, p10, p20);
}

// CUDA backend entry point for GGML_OP_PAD: pads src0 with zeros into dst.
// The per-dimension prefix paddings are read from dst->op_params (stored by
// ggml_pad_impl); the postfix amounts are implied by dst->ne - src0->ne.
// NOTE(review): the scraped diff contained both the old and new assert / call
// lines; this keeps the updated (prefix-padding-aware) version.
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

    const int32_t * opts = (const int32_t *)dst->op_params;

    const int32_t p00 = opts[0]; // prefix padding, dim 0
    const int32_t p10 = opts[1]; // prefix padding, dim 1
    const int32_t p20 = opts[2]; // prefix padding, dim 2
    //const int32_t p30 = opts[3]; // unused: ne[3] == 1 is asserted above

    pad_f32_cuda(src0_d, dst_d,
        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], p00, p10, p20, stream);
}
9 changes: 9 additions & 0 deletions src/ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -2466,6 +2466,11 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);

const int64_t p00 = ((int32_t *) dst->op_params)[0];
const int64_t p10 = ((int32_t *) dst->op_params)[1];
const int64_t p20 = ((int32_t *) dst->op_params)[2];
const int64_t p30 = ((int32_t *) dst->op_params)[3];

id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;

[encoder setComputePipelineState:pipeline];
Expand All @@ -2487,6 +2492,10 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&p00 length:sizeof(p00) atIndex:18];
[encoder setBytes:&p10 length:sizeof(p10) atIndex:19];
[encoder setBytes:&p20 length:sizeof(p20) atIndex:20];
[encoder setBytes:&p30 length:sizeof(p30) atIndex:21];

const int nth = MIN(1024, ne0);

Expand Down
27 changes: 13 additions & 14 deletions src/ggml-metal.metal
Original file line number Diff line number Diff line change
Expand Up @@ -1968,6 +1968,10 @@ kernel void kernel_pad_f32(
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int64_t & p00,
constant int64_t & p10,
constant int64_t & p20,
constant int64_t & p30,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
Expand All @@ -1980,23 +1984,18 @@ kernel void kernel_pad_f32(
const int64_t i02 = i2;
const int64_t i01 = i1;

device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
if (i0 < ne00) {
dst_ptr[i0] = src0_ptr[i0];
} else {
dst_ptr[i0] = 0.0f;
}
}
if (i0 < ne00 + p00 && i1 < ne01 + p10 && i2 < ne02 + p20 && i3 < ne03 + p30 &&
i0 >= p00 && i1 >= p10 && i2 >= p20 && i3 >= p30) {

return;
}
device const float * src0_ptr = (device const float *) (src0 + (i03 - p30)*nb03 + (i02 - p20)*nb02 + (i01 - p10)*nb01 + (i0 - p00)*nb00);

for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = 0.0f;
dst_ptr[0] = src0_ptr[0];
} else {
dst_ptr[0] = 0.0f;
}
}
}

Expand Down
51 changes: 40 additions & 11 deletions src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -6823,12 +6823,10 @@ struct ggml_tensor * ggml_upscale_ext(
return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
}

// ggml_pad

struct ggml_tensor * ggml_pad(
static struct ggml_tensor * ggml_pad_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0, int p1, int p2, int p3) {
int p00, int p01, int p10, int p11, int p20, int p21, int p30, int p31) {
bool is_node = false;

if (a->grad) {
Expand All @@ -6837,18 +6835,41 @@ struct ggml_tensor * ggml_pad(
}

struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
a->ne[0] + p0,
a->ne[1] + p1,
a->ne[2] + p2,
a->ne[3] + p3);
a->ne[0] + p00 + p01,
a->ne[1] + p10 + p11,
a->ne[2] + p20 + p21,
a->ne[3] + p30 + p31);

result->op = GGML_OP_PAD;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;

int32_t params[] = { p00, p10, p20, p30, }; // the pX1 values can be derived
ggml_set_op_params(result, params, sizeof(params));

return result;
}

// ggml_pad

// pad `a` with zeros after the existing data only (no prefix padding);
// forwards to ggml_pad_impl with every per-dimension prefix set to 0
struct ggml_tensor * ggml_pad(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int p0, int p1, int p2, int p3) {
    return ggml_pad_impl(ctx, a,
            0, p0,  // dim 0: prefix, postfix
            0, p1,  // dim 1
            0, p2,  // dim 2
            0, p3); // dim 3
}

// pad `a` with zeros before and after the existing data;
// pX0 is the leading (prefix) and pX1 the trailing (postfix) padding of dim X
struct ggml_tensor * ggml_pad_ext(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int p00,
        int p01,
        int p10,
        int p11,
        int p20,
        int p21,
        int p30,
        int p31) {
    return ggml_pad_impl(ctx, a, p00, p01, p10, p11, p20, p21, p30, p31);
}

// ggml_arange

struct ggml_tensor * ggml_arange(
Expand Down Expand Up @@ -14788,6 +14809,13 @@ static void ggml_compute_forward_pad_f32(
const int ith = params->ith;
const int nth = params->nth;

const int32_t * opts = (const int32_t *)dst->op_params;

const int32_t p00 = opts[0];
const int32_t p10 = opts[1];
const int32_t p20 = opts[2];
const int32_t p30 = opts[3];

GGML_TENSOR_UNARY_OP_LOCALS

float * dst_ptr = (float *) dst->data;
Expand All @@ -14800,12 +14828,13 @@ static void ggml_compute_forward_pad_f32(
for (int64_t i3 = 0; i3 < ne3; ++i3) {
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;

const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
const float * src_ptr = (const float *)((char *) src0->data + (i3 - p30)*nb03 + (i2 - p20)*nb02 + (i1 - p10)*nb01 + (i0 - p00)*nb00);

if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
if (i0 < ne00 + p00 && i1 < ne01 + p10 && i2 < ne02 + p20 && i3 < ne03 + p30 &&
i0 >= p00 && i1 >= p10 && i2 >= p20 && i3 >= p30) {
dst_ptr[dst_idx] = *src_ptr;
} else {
dst_ptr[dst_idx] = 0;
dst_ptr[dst_idx] = 0.0;
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,14 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

#
# test-pad

set(TEST_TARGET test-pad)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

#
# test-rel-pos

Expand Down
13 changes: 7 additions & 6 deletions tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1539,21 +1539,22 @@ struct test_acc : public test_case {
// GGML_OP_PAD test case: pads a tensor of shape ne_a with p0 (prefix) and
// p1 (postfix) zeros in each of the four dimensions via ggml_pad_ext.
// NOTE(review): the scraped diff contained both the old (pad_0/pad_1) and the
// new (p0/p1 array) members and calls; this keeps the updated version.
struct test_pad : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne_a; // shape of the unpadded input
    const std::array<int64_t, 4> p0;   // prefix padding per dimension
    const std::array<int64_t, 4> p1;   // postfix padding per dimension

    std::string vars() override {
        return VARS_TO_STR4(type, ne_a, p0, p1);
    }

    test_pad(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
            std::array<int64_t, 4> p0 = {3, 2, 4, 0},
            std::array<int64_t, 4> p1 = {2, 5, 1, 0})
        : type(type), ne_a(ne_a), p0(p0), p1(p1) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
        ggml_tensor * out = ggml_pad_ext(ctx, a, p0[0], p1[0], p0[1], p1[1], p0[2], p1[2], p0[3], p1[3]);
        return out;
    }
};
Expand Down
Loading
Loading