From 265921b4a307d71bfc408b8ab927501d79d77973 Mon Sep 17 00:00:00 2001
From: contre <rpastric@contre.us>
Date: Wed, 24 Jul 2019 10:58:19 -0700
Subject: [PATCH] [pcl] Fix cuda building compatability issues with cuda 10.1
 (#7388)

---
 ports/pcl/CONTROL         |   2 +-
 ports/pcl/cuda_10_1.patch | 133 ++++++++++++++++++++++++++++++++++++++
 ports/pcl/portfile.cmake  |   1 +
 3 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 ports/pcl/cuda_10_1.patch

diff --git a/ports/pcl/CONTROL b/ports/pcl/CONTROL
index 1a9852c8e9..41714a66aa 100644
--- a/ports/pcl/CONTROL
+++ b/ports/pcl/CONTROL
@@ -1,5 +1,5 @@
 Source: pcl
-Version: 1.9.1-4
+Version: 1.9.1-5
 Homepage: https://github.com/PointCloudLibrary/pcl
 Description: Point Cloud Library (PCL) is open source library for 2D/3D image and point cloud processing.
 Build-Depends: eigen3, flann, qhull, vtk, libpng, boost-system, boost-filesystem, boost-thread, boost-date-time, boost-iostreams, boost-random, boost-foreach, boost-dynamic-bitset, boost-property-map, boost-graph, boost-multi-array, boost-signals2, boost-ptr-container, boost-uuid, boost-interprocess, boost-asio
diff --git a/ports/pcl/cuda_10_1.patch b/ports/pcl/cuda_10_1.patch
new file mode 100644
index 0000000000..4b46149a4b
--- /dev/null
+++ b/ports/pcl/cuda_10_1.patch
@@ -0,0 +1,133 @@
+diff --git a/cuda/common/include/pcl/cuda/thrust.h b/cuda/common/include/pcl/cuda/thrust.h
+index 57586ab..af073d7 100644
+--- a/cuda/common/include/pcl/cuda/thrust.h
++++ b/cuda/common/include/pcl/cuda/thrust.h
+@@ -42,6 +42,7 @@
+
+ #include <thrust/host_vector.h>
+ #include <thrust/device_vector.h>
++#include <thrust/device_malloc.h>
+ #include <thrust/copy.h>
+ #include <thrust/device_ptr.h>
+ #include <thrust/sequence.h>
+diff --git a/gpu/features/src/fpfh.cu b/gpu/features/src/fpfh.cu
+index 8d34f76..59a5f0c 100644
+--- a/gpu/features/src/fpfh.cu
++++ b/gpu/features/src/fpfh.cu
+@@ -231,12 +231,12 @@ namespace pcl
+                 int *sinds = sindices + Warp::WARP_SIZE * warp_idx;
+                 int size = sizes[idx];
+ 
+-                for(int i = lane; __any(i < size); i += Warp::STRIDE)                
++                for(int i = lane; __any_sync(0xFFFFFFFF, i < size); i += Warp::STRIDE)                
+                 {
+                     if (i < size)
+                         sinds[lane] = ginds[i];
+ 
+-                    int inds_num = __popc(__ballot(i < size));
++                    int inds_num = __popc(__ballot_sync(0xFFFFFFFF, i < size));
+ 
+                     for(int j = 0; j < inds_num; ++j)
+                     {
+diff --git a/gpu/octree/src/cuda/approx_nsearch.cu b/gpu/octree/src/cuda/approx_nsearch.cu
+index e457255..3e1adfe 100644
+--- a/gpu/octree/src/cuda/approx_nsearch.cu
++++ b/gpu/octree/src/cuda/approx_nsearch.cu
+@@ -141,7 +141,7 @@ namespace pcl { namespace device { namespace appnearest_search
+ 		{   
+             __shared__ volatile int  per_warp_buffer[KernelPolicy::WARPS_COUNT];
+ 
+-			int mask = __ballot(node_idx != -1);                        
++			int mask = __ballot_sync(0xFFFFFFFF, node_idx != -1);                        
+ 
+ 			while(mask)
+ 			{                
+@@ -275,7 +275,7 @@ namespace pcl { namespace device { namespace appnearest_search
+ 
+ 		bool active = query_index < batch.queries_num;
+ 
+-		if (__all(active == false)) 
++		if (__all_sync(0xFFFFFFFF, active == false)) 
+ 			return;
+ 
+ 		Warp_appNearestSearch search(batch, query_index);
+diff --git a/gpu/octree/src/cuda/knn_search.cu b/gpu/octree/src/cuda/knn_search.cu
+index a99655d..b55e3c1 100644
+--- a/gpu/octree/src/cuda/knn_search.cu
++++ b/gpu/octree/src/cuda/knn_search.cu
+@@ -106,7 +106,7 @@ namespace pcl { namespace device { namespace knn_search
+             else                
+                 query_index = -1;    
+ 
+-            while(__any(active))
++            while(__any_sync(0xFFFFFFFF, active))
+             {                
+                 int leaf = -1;
+            
+@@ -163,7 +163,7 @@ namespace pcl { namespace device { namespace knn_search
+ 
+         __device__ __forceinline__ void processLeaf(int node_idx)
+         {   
+-            int mask = __ballot(node_idx != -1);            
++            int mask = __ballot_sync(0xFFFFFFFF, node_idx != -1);            
+ 
+             unsigned int laneId = Warp::laneId();
+             unsigned int warpId = Warp::id();
+@@ -310,7 +310,7 @@ namespace pcl { namespace device { namespace knn_search
+                                 
+         bool active = query_index < batch.queries_num;
+ 
+-        if (__all(active == false)) 
++        if (__all_sync(0xFFFFFFFF, active == false)) 
+             return;
+ 
+         Warp_knnSearch search(batch, query_index);
+diff --git a/gpu/octree/src/cuda/radius_search.cu b/gpu/octree/src/cuda/radius_search.cu
+index f90273f..8ae84e7 100644
+--- a/gpu/octree/src/cuda/radius_search.cu
++++ b/gpu/octree/src/cuda/radius_search.cu
+@@ -155,7 +155,7 @@ namespace pcl
+                 else                
+                     query_index = -1;
+ 
+-                while(__any(active))
++                while(__any_sync(0xFFFFFFFF, active))
+                 {                
+                     int leaf = -1;                
+ 
+@@ -217,7 +217,7 @@ namespace pcl
+ 
+             __device__ __forceinline__ void processLeaf(int leaf)
+             {   
+-                int mask = __ballot(leaf != -1);            
++                int mask = __ballot_sync(0xFFFFFFFF, leaf != -1);            
+ 
+                 while(mask)
+                 {                
+@@ -255,7 +255,7 @@ namespace pcl
+                     int *out = batch.output + active_query_index * batch.max_results + active_found_count;                    
+                     int length_left = batch.max_results - active_found_count;
+ 
+-                    int test = __any(active_lane == laneId && (leaf & KernelPolicy::CHECK_FLAG));
++                    int test = __any_sync(0xFFFFFFFF, active_lane == laneId && (leaf & KernelPolicy::CHECK_FLAG));
+ 
+                     if (test)
+                     {                                        
+@@ -329,7 +329,7 @@ namespace pcl
+                     total_new += new_nodes;
+                     out += new_nodes;                
+ 
+-                    if (__all(idx >= length) || __any(out_of_bounds) || total_new == length_left)
++                    if (__all_sync(0xFFFFFFFF, idx >= length) || __any_sync(0xFFFFFFFF, out_of_bounds) || total_new == length_left)
+                         break;
+                 }
+                 return min(total_new, length_left);
+@@ -343,7 +343,7 @@ namespace pcl
+ 
+             bool active = query_index < batch.queries.size;
+ 
+-            if (__all(active == false)) 
++            if (__all_sync(0xFFFFFFFF, active == false)) 
+                 return;
+ 
+             Warp_radiusSearch<BatchType> search(batch, query_index);
diff --git a/ports/pcl/portfile.cmake b/ports/pcl/portfile.cmake
index b30ceb0d8e..b855831e9d 100644
--- a/ports/pcl/portfile.cmake
+++ b/ports/pcl/portfile.cmake
@@ -11,6 +11,7 @@ vcpkg_from_github(
         pcl_config.patch
         use_flann_targets.patch
         boost-1.70.patch
+        cuda_10_1.patch
 )
 
 file(REMOVE ${SOURCE_PATH}/cmake/Modules/FindFLANN.cmake)