Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adds ggml_pad_ext to allow prefix padding #864

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -1689,6 +1689,7 @@ extern "C" {
int ne3);

// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
// only supports postfix padding
GGML_API struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,
struct ggml_tensor * a,
Expand All @@ -1697,6 +1698,21 @@ extern "C" {
int p2,
int p3);


// pad each dimension with zeros: [x, ..., x] -> [0, ..., 0, x, ..., x, 0, ..., 0]
// supports prefix and postfix padding
GGML_API struct ggml_tensor * ggml_pad_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p00,
int p01,
int p10,
int p11,
int p20,
int p21,
int p30,
int p31);

// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
Expand Down
32 changes: 23 additions & 9 deletions src/ggml-cuda/pad.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "pad.cuh"

static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03, const int p00, const int p10, const int p20) {
// blockIdx.z: idx of ne2*ne3, aka ne02*ne03
// blockIdx.y: idx of ne1
// blockIdx.x: idx of ne0 / BLOCK_SIZE
Expand All @@ -14,11 +14,13 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {

if (nidx < ne00 + p00 && blockIdx.y < ne01 + p10 && blockIdx.z < (ne02*ne03) + p20 &&
nidx >= p00 && blockIdx.y >= p10 && blockIdx.z >= p20) {
int offset_src =
nidx +
blockIdx.y * ne00 +
blockIdx.z * ne00 * ne01;
(nidx - p00) +
(blockIdx.y - p10) * ne00 +
(blockIdx.z - p20) * ne00 * ne01;
dst[offset_dst] = x[offset_src];
} else {
dst[offset_dst] = 0.0f;
Expand All @@ -27,23 +29,35 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons

// launch helper for pad_f32: fills dst (shape [ne0, ne1, ne2, ne3]) from the
// contiguous f32 source x (shape [ne00, ne01, ne02, ne03]), shifting the source
// data by (p00, p10, p20) elements along dims 0..2 (prefix padding) and zeroing
// every element outside the shifted source region.
// NOTE(review): the scraped diff contained both the old and new parameter list /
// launch line; this keeps the updated (prefix-padding-aware) version.
static void pad_f32_cuda(const float * x, float * dst,
    const int ne00, const int ne01, const int ne02, const int ne03,
    const int ne0, const int ne1, const int ne2, const int ne3,
    const int p00, const int p10, const int p20, cudaStream_t stream) {
    // one thread per dst element along dim 0, rounded up to a whole block
    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
    dim3 gridDim(num_blocks, ne1, ne2*ne3);
    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03, p00, p10, p20);
}

// CUDA backend entry point for GGML_OP_PAD: pads src0 with zeros into dst.
// The per-dimension prefix paddings are read from dst->op_params (stored by
// ggml_pad_impl); the postfix amounts are implied by dst->ne - src0->ne.
// NOTE(review): the scraped diff contained both the old and new assert / call
// lines; this keeps the updated (prefix-padding-aware) version.
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

    const int32_t * opts = (const int32_t *)dst->op_params;

    const int32_t p00 = opts[0]; // prefix padding, dim 0
    const int32_t p10 = opts[1]; // prefix padding, dim 1
    const int32_t p20 = opts[2]; // prefix padding, dim 2
    //const int32_t p30 = opts[3]; // unused: ne[3] == 1 is asserted above

    pad_f32_cuda(src0_d, dst_d,
        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], p00, p10, p20, stream);
}
9 changes: 9 additions & 0 deletions src/ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -2466,6 +2466,11 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);

const int64_t p00 = ((int32_t *) dst->op_params)[0];
const int64_t p10 = ((int32_t *) dst->op_params)[1];
const int64_t p20 = ((int32_t *) dst->op_params)[2];
const int64_t p30 = ((int32_t *) dst->op_params)[3];

id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;

[encoder setComputePipelineState:pipeline];
Expand All @@ -2487,6 +2492,10 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&p00 length:sizeof(p00) atIndex:18];
[encoder setBytes:&p10 length:sizeof(p10) atIndex:19];
[encoder setBytes:&p20 length:sizeof(p20) atIndex:20];
[encoder setBytes:&p30 length:sizeof(p30) atIndex:21];

const int nth = MIN(1024, ne0);

Expand Down
27 changes: 13 additions & 14 deletions src/ggml-metal.metal
Original file line number Diff line number Diff line change
Expand Up @@ -1968,6 +1968,10 @@ kernel void kernel_pad_f32(
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int64_t & p00,
constant int64_t & p10,
constant int64_t & p20,
constant int64_t & p30,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
Expand All @@ -1980,23 +1984,18 @@ kernel void kernel_pad_f32(
const int64_t i02 = i2;
const int64_t i01 = i1;

device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
if (i0 < ne00) {
dst_ptr[i0] = src0_ptr[i0];
} else {
dst_ptr[i0] = 0.0f;
}
}
if (i0 < ne00 + p00 && i1 < ne01 + p10 && i2 < ne02 + p20 && i3 < ne03 + p30 &&
i0 >= p00 && i1 >= p10 && i2 >= p20 && i3 >= p30) {

return;
}
device const float * src0_ptr = (device const float *) (src0 + (i03 - p30)*nb03 + (i02 - p20)*nb02 + (i01 - p10)*nb01 + (i0 - p00)*nb00);

for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = 0.0f;
dst_ptr[0] = src0_ptr[0];
} else {
dst_ptr[0] = 0.0f;
}
}
}

Expand Down
51 changes: 40 additions & 11 deletions src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -6823,12 +6823,10 @@ struct ggml_tensor * ggml_upscale_ext(
return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
}

// ggml_pad

struct ggml_tensor * ggml_pad(
static struct ggml_tensor * ggml_pad_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0, int p1, int p2, int p3) {
int p00, int p01, int p10, int p11, int p20, int p21, int p30, int p31) {
bool is_node = false;

if (a->grad) {
Expand All @@ -6837,18 +6835,41 @@ struct ggml_tensor * ggml_pad(
}

struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
a->ne[0] + p0,
a->ne[1] + p1,
a->ne[2] + p2,
a->ne[3] + p3);
a->ne[0] + p00 + p01,
a->ne[1] + p10 + p11,
a->ne[2] + p20 + p21,
a->ne[3] + p30 + p31);

result->op = GGML_OP_PAD;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;

int32_t params[] = { p00, p10, p20, p30, }; // the pX1 values can be derived
ggml_set_op_params(result, params, sizeof(params));

return result;
}

// ggml_pad

// pad `a` with zeros after the existing data only (no prefix padding);
// forwards to ggml_pad_impl with every per-dimension prefix set to 0
struct ggml_tensor * ggml_pad(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int p0, int p1, int p2, int p3) {
    return ggml_pad_impl(ctx, a,
            0, p0,  // dim 0: prefix, postfix
            0, p1,  // dim 1
            0, p2,  // dim 2
            0, p3); // dim 3
}

// pad `a` with zeros before and after the existing data;
// pX0 is the leading (prefix) and pX1 the trailing (postfix) padding of dim X
struct ggml_tensor * ggml_pad_ext(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int p00,
        int p01,
        int p10,
        int p11,
        int p20,
        int p21,
        int p30,
        int p31) {
    return ggml_pad_impl(ctx, a, p00, p01, p10, p11, p20, p21, p30, p31);
}

// ggml_arange

struct ggml_tensor * ggml_arange(
Expand Down Expand Up @@ -14788,6 +14809,13 @@ static void ggml_compute_forward_pad_f32(
const int ith = params->ith;
const int nth = params->nth;

const int32_t * opts = (const int32_t *)dst->op_params;

const int32_t p00 = opts[0];
const int32_t p10 = opts[1];
const int32_t p20 = opts[2];
const int32_t p30 = opts[3];

GGML_TENSOR_UNARY_OP_LOCALS

float * dst_ptr = (float *) dst->data;
Expand All @@ -14800,12 +14828,13 @@ static void ggml_compute_forward_pad_f32(
for (int64_t i3 = 0; i3 < ne3; ++i3) {
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;

const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
const float * src_ptr = (const float *)((char *) src0->data + (i3 - p30)*nb03 + (i2 - p20)*nb02 + (i1 - p10)*nb01 + (i0 - p00)*nb00);

if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
if (i0 < ne00 + p00 && i1 < ne01 + p10 && i2 < ne02 + p20 && i3 < ne03 + p30 &&
i0 >= p00 && i1 >= p10 && i2 >= p20 && i3 >= p30) {
dst_ptr[dst_idx] = *src_ptr;
} else {
dst_ptr[dst_idx] = 0;
dst_ptr[dst_idx] = 0.0;
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,14 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

#
# test-pad

set(TEST_TARGET test-pad)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)

#
# test-rel-pos

Expand Down
13 changes: 7 additions & 6 deletions tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1539,21 +1539,22 @@ struct test_acc : public test_case {
// GGML_OP_PAD test case: pads a tensor of shape ne_a with p0 (prefix) and
// p1 (postfix) zeros in each of the four dimensions via ggml_pad_ext.
// NOTE(review): the scraped diff contained both the old (pad_0/pad_1) and the
// new (p0/p1 array) members and calls; this keeps the updated version.
struct test_pad : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne_a; // shape of the unpadded input
    const std::array<int64_t, 4> p0;   // prefix padding per dimension
    const std::array<int64_t, 4> p1;   // postfix padding per dimension

    std::string vars() override {
        return VARS_TO_STR4(type, ne_a, p0, p1);
    }

    test_pad(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
            std::array<int64_t, 4> p0 = {3, 2, 4, 0},
            std::array<int64_t, 4> p1 = {2, 5, 1, 0})
        : type(type), ne_a(ne_a), p0(p0), p1(p1) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
        ggml_tensor * out = ggml_pad_ext(ctx, a, p0[0], p1[0], p0[1], p1[1], p0[2], p1[2], p0[3], p1[3]);
        return out;
    }
};
Expand Down
Loading
Loading