
Commit 2eff566

Author: Stephan Herhut (authored and committed)
[MLIR] Add and, or, xor, min, max to gpu.all_reduce and the NVVM lowering
Summary: This patch adds built-in operations to the gpu.all_reduce op:
- integer only: `and`, `or`, `xor`
- float and integer: `min`, `max`

This is useful for higher-level dialects like OpenACC or OpenMP that can lower to the GPU dialect.

Differential Revision: https://reviews.llvm.org/D75766
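For illustration, a minimal sketch of how the new built-in reductions would be written in GPU dialect IR (the assembly form follows the gpu.all_reduce documentation updated in this patch; the values %val_f32 and %val_i32 and the surrounding kernel are hypothetical):

```
// Inside a gpu.launch / gpu.func body: floating-point minimum across the workgroup.
%min = "gpu.all_reduce"(%val_f32) ({}) { op = "min" } : (f32) -> (f32)
// Integer-only bitwise reduction across the workgroup.
%mask = "gpu.all_reduce"(%val_i32) ({}) { op = "and" } : (i32) -> (i32)
```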
1 parent 7fb562c commit 2eff566

File tree

13 files changed (+615, -11 lines)

mlir/include/mlir/Dialect/GPU/GPUOps.td

+13-3
@@ -482,15 +482,25 @@ def GPU_YieldOp : GPU_Op<"yield", [Terminator]>,
   }];
 }
 
-// These mirror the XLA ComparisonDirection enum.
+// add, mul mirror the XLA ComparisonDirection enum.
 def GPU_AllReduceOpAdd : StrEnumAttrCase<"add">;
+def GPU_AllReduceOpAnd : StrEnumAttrCase<"and">;
+def GPU_AllReduceOpMax : StrEnumAttrCase<"max">;
+def GPU_AllReduceOpMin : StrEnumAttrCase<"min">;
 def GPU_AllReduceOpMul : StrEnumAttrCase<"mul">;
+def GPU_AllReduceOpOr : StrEnumAttrCase<"or">;
+def GPU_AllReduceOpXor : StrEnumAttrCase<"xor">;
 
 def GPU_AllReduceOperationAttr : StrEnumAttr<"AllReduceOperationAttr",
     "built-in reduction operations supported by gpu.allreduce.",
     [
       GPU_AllReduceOpAdd,
+      GPU_AllReduceOpAnd,
+      GPU_AllReduceOpMax,
+      GPU_AllReduceOpMin,
       GPU_AllReduceOpMul,
+      GPU_AllReduceOpOr,
+      GPU_AllReduceOpXor
     ]>;
 
 def GPU_AllReduceOp : GPU_Op<"all_reduce",
@@ -514,8 +524,8 @@ def GPU_AllReduceOp : GPU_Op<"all_reduce",
     ```
     compute the sum of each work item's %0 value. The first version specifies
     the accumulation as operation, whereas the second version specifies the
-    accumulation as code region. The accumulation operation must either be
-    `add` or `mul`.
+    accumulation as code region. The accumulation operation must be one of:
+    `add`, `and`, `max`, `min`, `mul`, `or`, `xor`.
 
     Either none or all work items of a workgroup need to execute this op
     in convergence.
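For comparison, the same reduction spelled out with the pre-existing region form; this is roughly what the new `min` built-in saves users from writing (a sketch only: the `cmpf`/`select` spelling is standard-dialect syntax of this era and is assumed, not taken from this diff):

```
%2 = "gpu.all_reduce"(%0) ({
^bb(%lhs : f32, %rhs : f32):
  // Keep the smaller of the two partial results.
  %cmp = cmpf "ult", %lhs, %rhs : f32
  %min = select %cmp, %lhs, %rhs : f32
  "gpu.yield"(%min) : (f32) -> ()
}) : (f32) -> (f32)
```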

mlir/include/mlir/ExecutionEngine/RunnerUtils.h

+2
@@ -211,6 +211,8 @@ _mlir_ciface_print_memref_i8(UnrankedMemRefType<int8_t> *M);
 extern "C" MLIR_RUNNERUTILS_EXPORT void
 _mlir_ciface_print_memref_f32(UnrankedMemRefType<float> *M);
 
+extern "C" MLIR_RUNNERUTILS_EXPORT void print_memref_i32(int64_t rank,
+                                                         void *ptr);
 extern "C" MLIR_RUNNERUTILS_EXPORT void print_memref_f32(int64_t rank,
                                                          void *ptr);
 

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

+36-3
@@ -123,18 +123,51 @@ struct GPUAllReduceOpLowering : public ConvertToLLVMPattern {
       return isFloatingPoint ? getFactory<LLVM::FMulOp>()
                              : getFactory<LLVM::MulOp>();
     }
+    if (opName == "and") {
+      return getFactory<LLVM::AndOp>();
+    }
+    if (opName == "or") {
+      return getFactory<LLVM::OrOp>();
+    }
+    if (opName == "xor") {
+      return getFactory<LLVM::XOrOp>();
+    }
+    if (opName == "max") {
+      return isFloatingPoint ? getCmpFactory<LLVM::FCmpOp, LLVM::FCmpPredicate,
+                                             LLVM::FCmpPredicate::ugt>()
+                             : getCmpFactory<LLVM::ICmpOp, LLVM::ICmpPredicate,
+                                             LLVM::ICmpPredicate::ugt>();
+    }
+    if (opName == "min") {
+      return isFloatingPoint ? getCmpFactory<LLVM::FCmpOp, LLVM::FCmpPredicate,
+                                             LLVM::FCmpPredicate::ult>()
+                             : getCmpFactory<LLVM::ICmpOp, LLVM::ICmpPredicate,
+                                             LLVM::ICmpPredicate::ult>();
+    }
 
     return AccumulatorFactory();
   }
 
   /// Returns an accumulator factory that creates an op of type T.
-  template <typename T> AccumulatorFactory getFactory() const {
+  template <typename T>
+  AccumulatorFactory getFactory() const {
     return [](Location loc, Value lhs, Value rhs,
               ConversionPatternRewriter &rewriter) {
       return rewriter.create<T>(loc, lhs.getType(), lhs, rhs);
     };
   }
 
+  /// Returns an accumulator factory for comparison-based reductions such as
+  /// min and max. T is the type of the compare op.
+  template <typename T, typename PredicateEnum, PredicateEnum predicate>
+  AccumulatorFactory getCmpFactory() const {
+    return [](Location loc, Value lhs, Value rhs,
+              ConversionPatternRewriter &rewriter) {
+      Value cmp = rewriter.create<T>(loc, predicate, lhs, rhs);
+      return rewriter.create<LLVM::SelectOp>(loc, cmp, lhs, rhs);
+    };
+  }
+
   /// Creates an all_reduce across the block.
   ///
   /// First reduce the elements within a warp. The first thread of each warp
@@ -705,9 +738,9 @@ void mlir::populateGpuToNVVMConversionPatterns(
                   GPUAllReduceOpLowering, GPUShuffleOpLowering, GPUFuncOpLowering,
                   GPUReturnOpLowering>(converter);
   patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
-                                                 "__nv_fabs");
+                                                "__nv_fabs");
   patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__nv_ceilf",
-                                                  "__nv_ceil");
+                                                 "__nv_ceil");
   patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__nv_cosf",
                                                "__nv_cos");
   patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__nv_expf",
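The `getCmpFactory` accumulator above lowers each `max`/`min` reduction step to a compare followed by a select. Roughly, for a floating-point `max` the emitted pair looks like the following (LLVM dialect type and assembly spellings are an approximation for this period, not verified output of the pass):

```
// Sketch of one accumulation step for op = "max" on f32 operands.
%cmp = llvm.fcmp "ugt" %lhs, %rhs : !llvm.float
%max = llvm.select %cmp, %lhs, %rhs : !llvm.i1, !llvm.float
```

Note that the unordered predicates (`ugt`/`ult`) compare true when an operand is NaN, so a NaN input causes the left-hand value to be selected rather than giving IEEE maxnum/minnum semantics.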

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

+8
@@ -148,6 +148,14 @@ static LogicalResult verifyAllReduce(gpu::AllReduceOp allReduce) {
     }
     if (yieldCount == 0)
       return allReduce.emitError("expected gpu.yield op in region");
+  } else {
+    StringRef opName = *allReduce.op();
+    if ((opName == "and" || opName == "or" || opName == "xor") &&
+        !allReduce.getType().isa<IntegerType>()) {
+      return allReduce.emitError()
+             << '`' << opName << '`'
+             << " accumulator is only compatible with Integer type";
+    }
   }
   return success();
 }
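For example, under this verifier rule a hypothetical module containing the following ops would reject the first and accept the second, since the logical accumulators only make sense on integer types:

```
// Rejected: `xor` accumulator on a floating-point value.
%e = "gpu.all_reduce"(%f) ({}) { op = "xor" } : (f32) -> (f32)
// Accepted: `xor` accumulator on an integer value.
%x = "gpu.all_reduce"(%i) ({}) { op = "xor" } : (i32) -> (i32)
```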

mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp

+29
@@ -212,6 +212,25 @@ struct GpuAllReduceRewriter {
       return isFloatingPoint ? getFactory<AddFOp>() : getFactory<AddIOp>();
     if (opName == "mul")
       return isFloatingPoint ? getFactory<MulFOp>() : getFactory<MulIOp>();
+    if (opName == "and") {
+      return getFactory<AndOp>();
+    }
+    if (opName == "or") {
+      return getFactory<OrOp>();
+    }
+    if (opName == "xor") {
+      return getFactory<XOrOp>();
+    }
+    if (opName == "max") {
+      return isFloatingPoint
+                 ? getCmpFactory<CmpFOp, CmpFPredicate, CmpFPredicate::UGT>()
+                 : getCmpFactory<CmpIOp, CmpIPredicate, CmpIPredicate::ugt>();
+    }
+    if (opName == "min") {
+      return isFloatingPoint
+                 ? getCmpFactory<CmpFOp, CmpFPredicate, CmpFPredicate::ULT>()
+                 : getCmpFactory<CmpIOp, CmpIPredicate, CmpIPredicate::ult>();
+    }
     return AccumulatorFactory();
   }
 
@@ -222,6 +241,16 @@ struct GpuAllReduceRewriter {
     };
   }
 
+  /// Returns an accumulator factory for comparison-based reductions such as
+  /// min and max. T is the type of the compare op.
+  template <typename T, typename PredicateEnum, PredicateEnum predicate>
+  AccumulatorFactory getCmpFactory() const {
+    return [&](Value lhs, Value rhs) {
+      Value cmp = rewriter.create<T>(loc, predicate, lhs, rhs);
+      return rewriter.create<SelectOp>(loc, cmp, lhs, rhs);
+    };
+  }
+
   /// Creates an if-block skeleton and calls the two factories to generate the
   /// ops in the `then` and `else` block.
   ///

mlir/lib/ExecutionEngine/RunnerUtils.cpp

+24-5
@@ -27,7 +27,7 @@ extern "C" void _mlir_ciface_print_memref_vector_4x4xf32(
 
 extern "C" void _mlir_ciface_print_memref_i8(UnrankedMemRefType<int8_t> *M) {
   printUnrankedMemRefMetaData(std::cout, *M);
-  int rank = M->rank;
+  int64_t rank = M->rank;
   void *ptr = M->descriptor;
 
   switch (rank) {
@@ -41,9 +41,25 @@ extern "C" void _mlir_ciface_print_memref_i8(UnrankedMemRefType<int8_t> *M) {
   }
 }
 
+extern "C" void _mlir_ciface_print_memref_i32(UnrankedMemRefType<int32_t> *M) {
+  printUnrankedMemRefMetaData(std::cout, *M);
+  int64_t rank = M->rank;
+  void *ptr = M->descriptor;
+
+  switch (rank) {
+    MEMREF_CASE(int32_t, 0);
+    MEMREF_CASE(int32_t, 1);
+    MEMREF_CASE(int32_t, 2);
+    MEMREF_CASE(int32_t, 3);
+    MEMREF_CASE(int32_t, 4);
+  default:
+    assert(0 && "Unsupported rank to print");
+  }
+}
+
 extern "C" void _mlir_ciface_print_memref_f32(UnrankedMemRefType<float> *M) {
   printUnrankedMemRefMetaData(std::cout, *M);
-  int rank = M->rank;
+  int64_t rank = M->rank;
   void *ptr = M->descriptor;
 
   switch (rank) {
@@ -57,10 +73,13 @@ extern "C" void _mlir_ciface_print_memref_f32(UnrankedMemRefType<float> *M) {
   }
 }
 
+extern "C" void print_memref_i32(int64_t rank, void *ptr) {
+  UnrankedMemRefType<int32_t> descriptor = {rank, ptr};
+  _mlir_ciface_print_memref_i32(&descriptor);
+}
+
 extern "C" void print_memref_f32(int64_t rank, void *ptr) {
-  UnrankedMemRefType<float> descriptor;
-  descriptor.rank = rank;
-  descriptor.descriptor = ptr;
+  UnrankedMemRefType<float> descriptor = {rank, ptr};
   _mlir_ciface_print_memref_f32(&descriptor);
 }
 
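A sketch of how a test could reach the new `print_memref_i32` entry point from MLIR, assuming the same unranked-memref calling convention the other runner utilities use (the standard-dialect op names and the cast below are illustrative, not taken from this commit):

```
// External runner utility; the unranked memref lowers to (rank, pointer).
func @print_memref_i32(memref<*xi32>)

func @dump(%arg : memref<2x2xi32>) {
  // Erase the static shape before handing the buffer to the runtime printer.
  %u = memref_cast %arg : memref<2x2xi32> to memref<*xi32>
  call @print_memref_i32(%u) : (memref<*xi32>) -> ()
  return
}
```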
