src/Seeding/CreateSeedsForGroupSycl.cpp

0001 // This file is part of the Acts project.
0002 //
0003 // Copyright (C) 2020-2021 CERN for the benefit of the Acts project
0004 //
0005 // This Source Code Form is subject to the terms of the Mozilla Public
0006 // License, v. 2.0. If a copy of the MPL was not distributed with this
0007 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
0008
0009 // System include(s)
0010 #include <algorithm>
0011 #include <cstdint>
0012 #include <cstring>
0013 #include <exception>
0014 #include <functional>
0015 #include <memory>
0016 #include <vector>
0017
0018 // Acts include(s)
0019 #include "Acts/Utilities/Logger.hpp"
0020
0021 // SYCL plugin include(s)
0022 #include "Acts/Plugins/Sycl/Seeding/CreateSeedsForGroupSycl.hpp"
0023 #include "Acts/Plugins/Sycl/Seeding/detail/Types.hpp"
0024 #include "Acts/Plugins/Sycl/Utilities/CalculateNdRange.hpp"
0025
0026 #include "../Utilities/Arrays.hpp"
0027 #include "DupletSearch.hpp"
0028 #include "LinearTransform.hpp"
0029 #include "TripletFilter.hpp"
0030 #include "TripletSearch.hpp"
0031
0032 // VecMem include(s).
0033 #include "vecmem/containers/data/jagged_vector_buffer.hpp"
0034 #include "vecmem/containers/data/vector_buffer.hpp"
0035 #include "vecmem/utils/sycl/copy.hpp"
0036
0037 // SYCL include
0038 #include <CL/sycl.hpp>
0039
0040 namespace Acts::Sycl {
0041 // Kernel classes in order of execution.
0042 class ind_copy_bottom_kernel;
0043 class ind_copy_top_kernel;
0044 class triplet_search_kernel;
0045 class filter_2sp_fixed_kernel;
0046
0047 void createSeedsForGroupSycl(
0048     QueueWrapper wrappedQueue, vecmem::memory_resource& resource,
0049     vecmem::memory_resource* device_resource,
0050     const detail::DeviceSeedFinderConfig& seedFinderConfig,
0051     const DeviceExperimentCuts& deviceCuts,
0052     vecmem::vector<detail::DeviceSpacePoint>& bottomSPs,
0053     vecmem::vector<detail::DeviceSpacePoint>& middleSPs,
0054     vecmem::vector<detail::DeviceSpacePoint>& topSPs,
0055     std::vector<std::vector<detail::SeedData>>& seeds) {
0056   // Each vector stores data of space points in simplified
0057   // structures of float variables
0058   // M: number of middle space points
0059   // B: number of bottom space points
0060   // T: number of top space points
0061   const uint32_t M = middleSPs.size();
0062   const uint32_t B = bottomSPs.size();
0063   const uint32_t T = topSPs.size();
0064
0065   // Up to the Nth space point, the sum of compatible bottom/top space points.
0066   // We need these for indexing other vectors later in the algorithm.
0067   // These are prefix sum arrays, with a leading zero.
0068   // Those will be created with either host or shared memory resource
0069   vecmem::vector<uint32_t> sumBotMidPrefix(&resource);
0070   sumBotMidPrefix.push_back(0);
0071   vecmem::vector<uint32_t> sumTopMidPrefix(&resource);
0072   sumTopMidPrefix.push_back(0);
0073   vecmem::vector<uint32_t> sumBotTopCombPrefix(&resource);
0074   sumBotTopCombPrefix.push_back(0);
0075
0076   // After completing the duplet search, we'll have successfully constructed
0077   // two bipartite graphs for bottom-middle and top-middle space points.
0078   // We store the indices of the middle space points of the corresponding
0079   // edges.
0080   vecmem::vector<uint32_t> indMidBotComp(&resource);
0081   vecmem::vector<uint32_t> indMidTopComp(&resource);
0082
0083   try {
0084     auto* q = wrappedQueue.getQueue();
0085     uint64_t globalBufferSize =
0086         q->get_device().get_info<cl::sycl::info::device::global_mem_size>();
0087     uint64_t maxWorkGroupSize =
0088         q->get_device().get_info<cl::sycl::info::device::max_work_group_size>();
0089     vecmem::sycl::copy copy(wrappedQueue.getQueue());
0090
0091     // Calculate 2 dimensional range of bottom-middle duplet search kernel
0092     // We'll have a total of M*B threads globally, but we need to give the
0093     // nd_range the global dimensions so that they are an exact multiple of
0094     // the local dimensions. That's why we need this calculation.
0095
0096     cl::sycl::nd_range<2> bottomDupletNDRange =
0097         calculate2DimNDRange(M, B, maxWorkGroupSize);
0098     cl::sycl::nd_range<2> topDupletNDRange =
0099         calculate2DimNDRange(M, T, maxWorkGroupSize);
0100
0101     // Create views of the space point vectors.
0102     // They will be constructed differently depending on the number of memory
0103     // resources given.
0104     std::unique_ptr<vecmem::data::vector_buffer<detail::DeviceSpacePoint>>
0105         deviceBottomSPs, deviceTopSPs, deviceMiddleSPs;
0106     vecmem::data::vector_view<detail::DeviceSpacePoint> bottomSPsView,
0107         topSPsView, middleSPsView;
0108     if (!device_resource) {
0109       bottomSPsView = vecmem::get_data(bottomSPs);
0110       topSPsView = vecmem::get_data(topSPs);
0111       middleSPsView = vecmem::get_data(middleSPs);
0112     } else {
0113       deviceBottomSPs = std::make_unique<
0114           vecmem::data::vector_buffer<detail::DeviceSpacePoint>>(
0115           B, *device_resource);
0116       deviceTopSPs = std::make_unique<
0117           vecmem::data::vector_buffer<detail::DeviceSpacePoint>>(
0118           T, *device_resource);
0119       deviceMiddleSPs = std::make_unique<
0120           vecmem::data::vector_buffer<detail::DeviceSpacePoint>>(
0121           M, *device_resource);
0122
0123       copy(vecmem::get_data(bottomSPs), *deviceBottomSPs);
0124       copy(vecmem::get_data(topSPs), *deviceTopSPs);
0125       copy(vecmem::get_data(middleSPs), *deviceMiddleSPs);
0126
0127       bottomSPsView = vecmem::get_data(*deviceBottomSPs);
0128       topSPsView = vecmem::get_data(*deviceTopSPs);
0129       middleSPsView = vecmem::get_data(*deviceMiddleSPs);
0130     }
0131     //*********************************************//
0132     // ********** DUPLET SEARCH - BEGIN ********** //
0133     //*********************************************//
0134
0135     // Create the output data of the duplet search - jagged vectors.
0136     std::unique_ptr<vecmem::data::jagged_vector_buffer<uint32_t>>
0137         midBotDupletBuffer;
0138     std::unique_ptr<vecmem::data::jagged_vector_buffer<uint32_t>>
0139         midTopDupletBuffer;
0140
0141     midBotDupletBuffer =
0142         std::make_unique<vecmem::data::jagged_vector_buffer<uint32_t>>(
0143             std::vector<std::size_t>(M, 0), std::vector<std::size_t>(M, B),
0144             (device_resource ? *device_resource : resource),
0145             (device_resource ? &resource : nullptr));
0146     midTopDupletBuffer =
0147         std::make_unique<vecmem::data::jagged_vector_buffer<uint32_t>>(
0148             std::vector<std::size_t>(M, 0), std::vector<std::size_t>(M, T),
0149             (device_resource ? *device_resource : resource),
0150             (device_resource ? &resource : nullptr));
0151     copy.setup(*midBotDupletBuffer);
0152     copy.setup(*midTopDupletBuffer);
0153
0154     // Perform the middle-bottom duplet search.
0155     auto middleBottomEvent = q->submit([&](cl::sycl::handler& h) {
0156       detail::DupletSearch<detail::SpacePointType::Bottom> kernel(
0157           middleSPsView, bottomSPsView, *midBotDupletBuffer, seedFinderConfig);
0158       h.parallel_for<class DupletSearchBottomKernel>(bottomDupletNDRange,
0159                                                      kernel);
0160     });
0161
0162     // Perform the middle-top duplet search.
0163     auto middleTopEvent = q->submit([&](cl::sycl::handler& h) {
0164       detail::DupletSearch<detail::SpacePointType::Top> kernel(
0165           middleSPsView, topSPsView, *midTopDupletBuffer, seedFinderConfig);
0166       h.parallel_for<class DupletSearchTopKernel>(topDupletNDRange, kernel);
0167     });
0168     middleBottomEvent.wait_and_throw();
0169     middleTopEvent.wait_and_throw();
0170     //*********************************************//
0171     // *********** DUPLET SEARCH - END *********** //
0172     //*********************************************//
0173
0174     // Get the sizes of the inner vectors of the jagged vector - number of
0175     // compatible bottom/top SPs for each MiddleSP.
0176     auto countBotDuplets = copy.get_sizes(*midBotDupletBuffer);
0177     auto countTopDuplets = copy.get_sizes(*midTopDupletBuffer);
0178     // Construct prefix sum arrays of duplet counts.
0179     // These will later be used to index other arrays based on middle SP
0180     // indices.
0181     for (uint32_t i = 1; i < M + 1; ++i) {
0182       sumBotMidPrefix.push_back(sumBotMidPrefix.at(i - 1) +
0183                                 countBotDuplets[i - 1]);
0184       sumTopMidPrefix.push_back(sumTopMidPrefix.at(i - 1) +
0185                                 countTopDuplets[i - 1]);
0186       sumBotTopCombPrefix.push_back(sumBotTopCombPrefix.at(i - 1) +
0187                                     countBotDuplets[i - 1] *
0188                                         countTopDuplets[i - 1]);
0189     }
0190     // Number of edges for middle-bottom and middle-top duplet bipartite graphs.
0191     const uint64_t edgesBottom = sumBotMidPrefix[M];
0192     const uint64_t edgesTop = sumTopMidPrefix[M];
0193     // Number of possible compatible triplets. This is the sum of the
0194     // combination of the number of compatible bottom and compatible top duplets
0195     // per middle space point. (nb0*nt0 + nb1*nt1 + ... where nbk is the number
0196     // of comp. bot. SPs for the kth middle SP)
0197     const uint64_t edgesComb = sumBotTopCombPrefix[M];
0198
0199     indMidBotComp.reserve(edgesBottom);
0200     indMidTopComp.reserve(edgesTop);
0201
0202     // Fill arrays of middle SP indices of found duplets (bottom and top).
0203     for (uint32_t mid = 0; mid < M; ++mid) {
0204       std::fill_n(std::back_inserter(indMidBotComp), countBotDuplets[mid], mid);
0205       std::fill_n(std::back_inserter(indMidTopComp), countTopDuplets[mid], mid);
0206     }
0207
0208     if (edgesBottom > 0 && edgesTop > 0) {
0209       // Calculate global and local range of execution for edgesBottom number of
0210       // threads. Local range is the same as block size in CUDA.
0211       cl::sycl::nd_range<1> edgesBotNdRange =
0212           calculate1DimNDRange(edgesBottom, maxWorkGroupSize);
0213
0214       // Global and local range of execution for edgesTop number of threads.
0215       cl::sycl::nd_range<1> edgesTopNdRange =
0216           calculate1DimNDRange(edgesTop, maxWorkGroupSize);
0217
0218       // EXPLANATION OF INDEXING (first part)
0219       /*
0220         (for bottom-middle duplets, but it is the same for middle-tops)
0221         In case we have 4 middle SP and 5 bottom SP, our temporary array of
0222         the compatible bottom duplet indices would look like this:
0223              ---------------------
0224         mid0 | 0 | 3 | 4 | 1 | - |    Indices in the columns correspond to
0225         mid1 | 3 | 2 | - | - | - |    bottom SP indices in the bottomSPs
0226         mid2 | - | - | - | - | - |    array. Threads are executed concurrently,
0227         mid3 | 4 | 2 | 1 | - | - |    so the order of indices is random.
0228              ---------------------
0229         We will refer to this structure as a bipartite graph, as it can be
0230         described by a graph of nodes for middle and bottom SPs, and edges
0231         between one middle and one bottom SP, but never two middle or two
0232         bottom SPs.
0233         We will flatten this matrix out, and store the indices the
0234         following way (this is indBotDupletBuffer):
0235         -------------------------------------
0236         | 0 | 3 | 4 | 1 | 3 | 2 | 4 | 2 | 1 |
0237         -------------------------------------
0238         Also the length of this array is equal to edgesBottom, which is 9 in
0239         this example. It is the number of the edges of the bottom-middle
0240         bipartite graph.
0241         To find out where the indices of bottom SPs start for a particular
0242         middle SP, we use prefix sum arrays.
0243         We know how many duplets were found for each middle SP (this is
0244         countBotDuplets).
0245         -----------------
0246         | 4 | 2 | 0 | 3 |
0247         -----------------
0248         We will make a prefix sum array of these counts, with a leading zero:
0249         (this is sumBotMidPrefix)
0250         ---------------------
0251         | 0 | 4 | 6 | 6 | 9 |
0252         ---------------------
0253         If we have the middle SP with index 1, then we know that the indices
0254         of the compatible bottom SPs are in the range (left closed, right
0255         open) [sumBotMidPrefix[1] , sumBotMidPrefix[2] ) of indBotDUpletBuffer.
0256         In this case, these indices are 3 and 2, so we'd use these to index
0257         views of bottomSPs to gather data about the bottom SP.
0258         To be able to get the indices of middle SPs in constant time inside
0259         kernels, we will also prepare arrays that store the indices of the
0260         middleSPs of the edges (indMidBotComp).
0261         -------------------------------------
0262         | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 3 | 3 |
0263         -------------------------------------
0264         (For the same purpose, we could also do a binary search on the
0265         sumBotMidPrefix array, and we will do exactly that later, in the triplet
0266         search kernel.)
0267         We will execute the coordinate transformation on edgesBottom threads,
0268         or 9 in our example.
0269         The size of the array storing our transformed coordinates
0270         (linearBotBuffer) is also edgesBottom, the sum of bottom duplets we
0271         found so far.
0272       */
0273
0274       // We store the indices of the BOTTOM/TOP space points of the edges of
0275       // the bottom-middle and top-middle bipartite duplet graphs. They index
0276       // the bottomSPs and topSPs vectors.
0277
0278       // We store the indices of the MIDDLE space points of the edges of the
0279       // bottom-middle and top-middle bipartite duplet graphs.
0280       // They index the middleSP vector.
0281       // indMidBotComp;
0282       // indMidTopComp;
0283
0284       // Partial sum arrays of deviceNumBot and deviceNum
0285       // Partial sum array of the combinations of compatible bottom and top
0286       // space points per middle space point.
0287       // Allocations for coordinate transformation.
0288
0289       // Buffers for Flattening the jagged vectors
0290       std::unique_ptr<vecmem::data::vector_buffer<uint32_t>> indBotDupletBuffer;
0291       std::unique_ptr<vecmem::data::vector_buffer<uint32_t>> indTopDupletBuffer;
0292
0293       indBotDupletBuffer =
0294           std::make_unique<vecmem::data::vector_buffer<uint32_t>>(
0295               edgesBottom, (device_resource ? *device_resource : resource));
0296       indTopDupletBuffer =
0297           std::make_unique<vecmem::data::vector_buffer<uint32_t>>(
0298               edgesTop, (device_resource ? *device_resource : resource));
0299
0300       copy.setup(*indBotDupletBuffer);
0301       copy.setup(*indTopDupletBuffer);
0302
0303       // Pointers constructed in case the device memory resource was given.
0304       std::unique_ptr<vecmem::data::vector_buffer<uint32_t>>
0305           device_sumBotMidPrefix, device_sumTopMidPrefix,
0306           device_sumBotTopCombPrefix;
0307       // Vecmem views of the prefix sums used throughout the later code.
0308       vecmem::data::vector_view<uint32_t> sumBotMidView, sumTopMidView,
0309           sumBotTopCombView;
0310
0311       // Same behaviour for the vectors of indices
0312       std::unique_ptr<vecmem::data::vector_buffer<uint32_t>>
0313           device_indMidBotComp, device_indMidTopComp;
0314       vecmem::data::vector_view<uint32_t> indMidBotCompView, indMidTopCompView;
0315       // Copy indices from temporary matrices to final, optimal size vectors.
0316       // We will use these for easier indexing.
0317       if (!device_resource) {
0318         sumBotMidView = vecmem::get_data(sumBotMidPrefix);
0319         sumTopMidView = vecmem::get_data(sumTopMidPrefix);
0320         sumBotTopCombView = vecmem::get_data(sumBotTopCombPrefix);
0321
0322         indMidBotCompView = vecmem::get_data(indMidBotComp);
0323         indMidTopCompView = vecmem::get_data(indMidTopComp);
0324       } else {
0325         device_sumBotMidPrefix =
0326             std::make_unique<vecmem::data::vector_buffer<uint32_t>>(
0327                 M + 1, *device_resource);
0328         device_sumTopMidPrefix =
0329             std::make_unique<vecmem::data::vector_buffer<uint32_t>>(
0330                 M + 1, *device_resource);
0331         device_sumBotTopCombPrefix =
0332             std::make_unique<vecmem::data::vector_buffer<uint32_t>>(
0333                 M + 1, *device_resource);
0334
0335         device_indMidBotComp =
0336             std::make_unique<vecmem::data::vector_buffer<uint32_t>>(
0337                 edgesBottom, *device_resource);
0338         device_indMidTopComp =
0339             std::make_unique<vecmem::data::vector_buffer<uint32_t>>(
0340                 edgesTop, *device_resource);
0341
0342         copy(vecmem::get_data(sumBotMidPrefix), *device_sumBotMidPrefix);
0343         copy(vecmem::get_data(sumTopMidPrefix), *device_sumTopMidPrefix);
0344         copy(vecmem::get_data(sumBotTopCombPrefix),
0345              *device_sumBotTopCombPrefix);
0346
0347         copy(vecmem::get_data(indMidBotComp), *device_indMidBotComp);
0348         copy(vecmem::get_data(indMidTopComp), *device_indMidTopComp);
0349
0350         sumBotMidView = vecmem::get_data(*device_sumBotMidPrefix);
0351         sumTopMidView = vecmem::get_data(*device_sumTopMidPrefix);
0352         sumBotTopCombView = vecmem::get_data(*device_sumBotTopCombPrefix);
0353
0354         indMidBotCompView = vecmem::get_data(*device_indMidBotComp);
0355         indMidTopCompView = vecmem::get_data(*device_indMidTopComp);
0356       }
0357       auto midBotDupletView = vecmem::get_data(*midBotDupletBuffer);
0358       auto indBotDupletView = vecmem::get_data(*indBotDupletBuffer);
0359       auto indBotEvent = q->submit([&](cl::sycl::handler& h) {
0360         h.parallel_for<ind_copy_bottom_kernel>(
0361             edgesBotNdRange, [=](cl::sycl::nd_item<1> item) {
0362               auto idx = item.get_global_linear_id();
0363               if (idx < edgesBottom) {
0364                 vecmem::device_vector<uint32_t> deviceIndMidBot(
0365                     indMidBotCompView),
0366                     sumBotMidPrefix(sumBotMidView),
0367                     indBotDuplets(indBotDupletView);
0368                 vecmem::jagged_device_vector<const uint32_t> midBotDuplets(
0369                     midBotDupletView);
0370                 auto mid = deviceIndMidBot[idx];
0371                 auto ind = midBotDuplets[mid][idx - sumBotMidPrefix[mid]];
0372                 indBotDuplets[idx] = ind;
0373               }
0374             });
0375       });
0376       auto midTopDupletView = vecmem::get_data(*midTopDupletBuffer);
0377       auto indTopDupletView = vecmem::get_data(*indTopDupletBuffer);
0378       auto indTopEvent = q->submit([&](cl::sycl::handler& h) {
0379         h.parallel_for<ind_copy_top_kernel>(
0380             edgesTopNdRange, [=](cl::sycl::nd_item<1> item) {
0381               auto idx = item.get_global_linear_id();
0382               if (idx < edgesTop) {
0383                 vecmem::device_vector<uint32_t> deviceIndMidTop(
0384                     indMidTopCompView),
0385                     sumTopMidPrefix(sumTopMidView),
0386                     indTopDuplets(indTopDupletView);
0387                 vecmem::jagged_device_vector<const uint32_t> midTopDuplets(
0388                     midTopDupletView);
0389                 auto mid = deviceIndMidTop[idx];
0390                 auto ind = midTopDuplets[mid][idx - sumTopMidPrefix[mid]];
0391                 indTopDuplets[idx] = ind;
0392               }
0393             });
0394       });
0395       indBotEvent.wait_and_throw();
0396       indTopEvent.wait_and_throw();
0397
0398       // Create the output data of the linear transform
0399       std::unique_ptr<vecmem::data::vector_buffer<detail::DeviceLinEqCircle>>
0400           linearBotBuffer;
0401       std::unique_ptr<vecmem::data::vector_buffer<detail::DeviceLinEqCircle>>
0402           linearTopBuffer;
0403
0404       linearBotBuffer = std::make_unique<
0405           vecmem::data::vector_buffer<detail::DeviceLinEqCircle>>(
0406           edgesBottom, (device_resource ? *device_resource : resource));
0407       linearTopBuffer = std::make_unique<
0408           vecmem::data::vector_buffer<detail::DeviceLinEqCircle>>(
0409           edgesTop, (device_resource ? *device_resource : resource));
0410
0411       copy.setup(*linearBotBuffer);
0412       copy.setup(*linearTopBuffer);
0413
0414       //************************************************//
0415       // *** LINEAR EQUATION TRANSFORMATION - BEGIN *** //
0416       //************************************************//
0417
0418       // transformation of circle equation (x,y) into linear equation (u,v)
0419       // x^2 + y^2 - 2x_0*x - 2y_0*y = 0
0420       // is transformed into
0421       // 1 - 2x_0*u - 2y_0*v = 0
0422
0423       // coordinate transformation middle-bottom pairs
0424       auto linB = q->submit([&](cl::sycl::handler& h) {
0425         detail::LinearTransform<detail::SpacePointType::Bottom> kernel(
0426             middleSPsView, bottomSPsView, indMidBotCompView,
0427             *indBotDupletBuffer, edgesBottom, *linearBotBuffer);
0428         h.parallel_for<class TransformCoordBottomKernel>(edgesBotNdRange,
0429                                                          kernel);
0430       });
0431
0432       // coordinate transformation middle-top pairs
0433       auto linT = q->submit([&](cl::sycl::handler& h) {
0434         detail::LinearTransform<detail::SpacePointType::Top> kernel(
0435             middleSPsView, topSPsView, indMidTopCompView, *indTopDupletBuffer,
0436             edgesTop, *linearTopBuffer);
0437         h.parallel_for<class TransformCoordTopKernel>(edgesTopNdRange, kernel);
0438       });
0439
0440       //************************************************//
0441       // **** LINEAR EQUATION TRANSFORMATION - END **** //
0442       //************************************************//
0443
0444       //************************************************//
0445       // *********** TRIPLET SEARCH - BEGIN *********** //
0446       //************************************************//
0447
0448       // EXPLANATION OF INDEXING (second part)
0449       /*
0450         For the triplet search, we calculate the upper limit of constructible
0451         triplets.
0452
0453         For this, we multiply the number of compatible bottom and compatible
0454         top SPs for each middle SP, and add these together.
0455         (nb0*nt0 + nb1*nt1 + ... where nbk is the number of compatible bottom
0456         SPs for the kth middle SP, similarly ntb is for tops)
0457
0458         sumBotTopCombPrefix is a prefix sum array (of length M+1) of the
0459         calculated combinations.
0460
0461         sumBotTopCombPrefix:
0462         ________________________________________________________
0463         |     |         |                   |     |  M         | M = number
0464         |  0  | nb0*nt0 | nb0*nt0 + nb1*nt1 | ... |  ∑ nbi+nti | of middle
0465         |_____|_________|___________________|_____|_i=0________| space points
0466
0467         We will start kernels and reserve memory for these combinations but
0468         only so much we can fit into memory at once.
0469
0470         We limit our memory usage to globalBufferSize/2, this is currently
0471         hard-coded, but it could be configured. Actually, it would be better
0472         to use a separate object that manages memory allocations and
0473         deallocations and we could ask it to lend us as much memory as it is
0474         happy to give.
0475
0476         For later, let maxMemoryAllocation be maximum allocatable memory for
0477         triplet search.
0478
0479         We start by adding up summing the combinations, until we arrive at a
0480         k which for:
0481
0482         k+1
0483          ∑ nbi+nti > maxMemoryAllocation
0484         i=0
0485         (or k == M).
0486
0487         So we know, that we need to start our first kernel for the first k
0488         middle SPs.
0489
0490         Inside the triplet search kernel we start with a binary search, to
0491         find out which middle SP the thread corresponds to. Note, that
0492         sumBotTopCombPrefix is a monotone increasing series of values which
0493         allows us to do a binary search on it.
0494
0495         Inside the triplet search kernel we count the triplets for fixed
0496         bottom and middle SP. This is deviceCountTriplets.
0497
0498         The triplet filter kernel is calculated on threads equal to all possible
0499         bottom-middle combinations for the first k middle SPs, which are
0500         the sum of bottom-middle duplets. (For the next kernel it would be the
0501         bottom-middle combinations from the (k+1)th middle SP to another jth
0502         middle SP j<=M.)
0503
0504         This will be numTripletFilterThreads =
0505             sumBotMidPrefix[lastMiddle] - sumBotMidPrefix[firstMiddle]
0506
0507         If the triplet search and triplet filter kernel finished, we continue
0508         summing up possible triplet combinations from the (k+1)th middle SP.
0509
0510         Inside the kernels we need to use offset because of this, to be able to
0511         map threads to space point indices.
0512
0513         This offset is sumCombUptoFirstMiddle.
0514       */
0515
0516       const auto maxMemoryAllocation =
0517           std::min(edgesComb,
0518                    globalBufferSize / uint64_t((sizeof(detail::DeviceTriplet) +
0519                                                 sizeof(detail::SeedData)) *
0520                                                2));
0521
0522       std::unique_ptr<vecmem::data::vector_buffer<detail::DeviceTriplet>>
0523           curvImpactBuffer;
0524       std::unique_ptr<vecmem::data::vector_buffer<detail::SeedData>>
0525           seedArrayBuffer;
0526
0527       curvImpactBuffer =
0528           std::make_unique<vecmem::data::vector_buffer<detail::DeviceTriplet>>(
0529               maxMemoryAllocation,
0530               (device_resource ? *device_resource : resource));
0531       seedArrayBuffer =
0532           std::make_unique<vecmem::data::vector_buffer<detail::SeedData>>(
0533               maxMemoryAllocation, 0,
0534               (device_resource ? *device_resource : resource));
0535
0536       copy.setup(*curvImpactBuffer);
0537       copy.setup(*seedArrayBuffer);
0538       // Reserve memory in advance for seed indices and weight
0539       // Other way around would allocate it inside the loop
0540       // -> less memory usage, but more frequent allocation and deallocation
0541
0542       // Counting the seeds in the second kernel allows us to copy back the
0543       // right number of seeds, and no more.
0544       seeds.resize(M);
0545       vecmem::vector<uint32_t> countTriplets(&resource);
0546       countTriplets.resize(edgesBottom, 0);
0547
0548       std::unique_ptr<vecmem::data::vector_buffer<uint32_t>>
0549           deviceCountTriplets;
0550       vecmem::data::vector_view<uint32_t> countTripletsView;
0551
0552       if (!device_resource) {
0553         countTripletsView = vecmem::get_data(countTriplets);
0554       } else {
0555         deviceCountTriplets =
0556             std::make_unique<vecmem::data::vector_buffer<uint32_t>>(
0557                 edgesBottom, *device_resource);
0558         copy(vecmem::get_data(countTriplets), *deviceCountTriplets);
0559         countTripletsView = vecmem::get_data(*deviceCountTriplets);
0560       }
0561
0562       // Do the triplet search and triplet filter for 2 sp fixed for middle
0563       // space points in the interval [firstMiddle, lastMiddle).
0564
0565       uint32_t lastMiddle = 0;
0566       for (uint32_t firstMiddle = 0; firstMiddle < M;
0567            firstMiddle = lastMiddle) {
0568         // Determine the interval [firstMiddle, lastMiddle) right end based
0569         // on memory requirements.
0570         while (lastMiddle + 1 <= M && (sumBotTopCombPrefix[lastMiddle + 1] -
0571                                            sumBotTopCombPrefix[firstMiddle] <
0572                                        maxMemoryAllocation)) {
0573           ++lastMiddle;
0574         }
0575
0576         const auto numTripletSearchThreads =
0577             sumBotTopCombPrefix[lastMiddle] - sumBotTopCombPrefix[firstMiddle];
0578
0579         if (numTripletSearchThreads == 0) {
0580           ++lastMiddle;
0581           continue;
0582         }
0583
0584         copy.setup(*seedArrayBuffer);
0585         const auto numTripletFilterThreads =
0586             sumBotMidPrefix[lastMiddle] - sumBotMidPrefix[firstMiddle];
0587
0588         // Nd_range with maximum block size for triplet search and filter.
0589         // (global and local range is already given)
0590         cl::sycl::nd_range<1> tripletSearchNDRange =
0591             calculate1DimNDRange(numTripletSearchThreads, maxWorkGroupSize);
0592
0593         cl::sycl::nd_range<1> tripletFilterNDRange =
0594             calculate1DimNDRange(numTripletFilterThreads, maxWorkGroupSize);
0595
0596         auto tripletKernel = q->submit([&](cl::sycl::handler& h) {
0597           h.depends_on({linB, linT});
0598           detail::TripletSearch kernel(
0599               sumBotTopCombView, numTripletSearchThreads, firstMiddle,
0600               lastMiddle, *midTopDupletBuffer, sumBotMidView, sumTopMidView,
0601               *linearBotBuffer, *linearTopBuffer, middleSPsView,
0602               *indTopDupletBuffer, countTripletsView, seedFinderConfig,
0603               *curvImpactBuffer);
0604           h.parallel_for<class triplet_search_kernel>(tripletSearchNDRange,
0605                                                       kernel);
0606         });
0607
0608         q->submit([&](cl::sycl::handler& h) {
0609            h.depends_on(tripletKernel);
0610            detail::TripletFilter kernel(
0611                numTripletFilterThreads, sumBotMidView, firstMiddle,
0612                indMidBotCompView, *indBotDupletBuffer, sumBotTopCombView,
0613                *midTopDupletBuffer, *curvImpactBuffer, topSPsView,
0614                middleSPsView, bottomSPsView, countTripletsView,
0615                *seedArrayBuffer, seedFinderConfig, deviceCuts);
0616            h.parallel_for<class filter_2sp_fixed_kernel>(tripletFilterNDRange,
0617                                                          kernel);
0618          }).wait_and_throw();
0619         // sync
0620         // Retrieve results from triplet search
0621         std::vector<detail::SeedData> seedArray;
0622         copy(*seedArrayBuffer, seedArray);
0623
0624         for (auto& t : seedArray) {
0625           seeds[t.middle].push_back(t);
0626         }
0627       }
0628
0629       //************************************************//
0630       // ************ TRIPLET SEARCH - END ************ //
0631       //************************************************//
0632     }
0633
0634   } catch (cl::sycl::exception const& e) {
0635     ACTS_LOCAL_LOGGER(
0636         Acts::getDefaultLogger("SyclSeeding", Acts::Logging::INFO));
0637     ACTS_FATAL("Caught synchronous SYCL exception:\n" << e.what())
0638     throw;
0639   }
0640 };
0641 }  // namespace Acts::Sycl