summaryrefslogtreecommitdiff
path: root/ext/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp
blob: 3a87dff9c2bcde26a68c9fcb3c493074cbe6a8af (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#pragma once

#include <array>
#include <type_traits>
#include <utility> // index_sequence
#include "../__p0009_bits/utility.hpp"

// Suppress spurious warning with NVCC about no return statement.
// This is a known issue in NVCC and NVC++
// Depending on the CUDA and GCC version we need both the builtin
// and the diagnostic push. I tried really hard to find something shorter
// but no luck ...
#if defined __NVCC__
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diagnostic push
#pragma nv_diag_suppress = implicit_return_from_non_void_function
#else
#ifdef __CUDA_ARCH__
#pragma diagnostic push
#pragma diag_suppress implicit_return_from_non_void_function
#endif
#endif
#elif defined __NVCOMPILER
#pragma diagnostic push
#pragma diag_suppress = implicit_return_from_non_void_function
#endif

namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
//******************************************
// Return type of submdspan_mapping overloads
//******************************************
// Aggregate returned by the submdspan_mapping overloads in this file.
//   mapping  layout mapping describing the sub-view
//   offset   value added to the source data handle to reach the first
//            element of the sub-view; the overloads in this file compute it
//            from the source mapping
template <class LayoutMapping> struct submdspan_mapping_result {
  MDSPAN_IMPL_NO_UNIQUE_ADDRESS LayoutMapping mapping{};
  // Value-initialized, like `mapping`, so that a default-constructed result
  // never holds an indeterminate offset.  Aggregate initialization with
  // {mapping, offset} is unaffected.
  size_t offset{};
};

namespace detail {

// Slices are taken as const Slice& rather than Slice&&: the
// submdspan_mapping_impl overloads read their slice arguments several
// times, so perfect forwarding would not help, and passing these objects
// (possibly 3 x 64 bits in size) by value would copy them needlessly.
//
// Returns true when the lower bound of `slice`, as reported by
// detail::first_of, compares equal to the extent `ext`.  Both values are
// converted to their common type before the comparison.
template <class IndexType, class Slice>
MDSPAN_INLINE_FUNCTION constexpr bool
one_slice_out_of_bounds(const IndexType &ext, const Slice &slice) {
  using lower_bound_t = decltype(detail::first_of(slice));
  using compare_t = std::common_type_t<lower_bound_t, IndexType>;
  const auto lower_bound = static_cast<compare_t>(detail::first_of(slice));
  const auto upper_limit = static_cast<compare_t>(ext);
  return lower_bound == upper_limit;
}

// Applies one_slice_out_of_bounds to every (extent, slice) pair:
// exts.extent(r) is paired with the r-th slice for each r in RankIndices.
// RankIndices and Slices are expanded together, so both packs must have the
// same length.  The per-rank results are combined with MDSPAN_IMPL_FOLD_OR
// (macro defined elsewhere; presumably a logical-or fold).
template <size_t... RankIndices, class IndexType, size_t... Exts,
          class... Slices>
MDSPAN_INLINE_FUNCTION constexpr bool
any_slice_out_of_bounds_helper(std::index_sequence<RankIndices...>,
                               const extents<IndexType, Exts...> &exts,
                               const Slices &... slices) {
  return MDSPAN_IMPL_FOLD_OR(
      (one_slice_out_of_bounds(exts.extent(RankIndices), slices)));
}

// Entry point for the lower-bound check used by the submdspan_mapping_impl
// overloads: builds one rank index per slice (0 .. sizeof...(Slices) - 1)
// and forwards to any_slice_out_of_bounds_helper, which compares each
// slice's lower bound against the extent of the same rank.
template <class IndexType, size_t... Exts, class... Slices>
MDSPAN_INLINE_FUNCTION constexpr bool
any_slice_out_of_bounds(const extents<IndexType, Exts...> &exts,
                        const Slices &... slices) {
  return any_slice_out_of_bounds_helper(
      std::make_index_sequence<sizeof...(Slices)>(), exts, slices...);
}

// Aggregate carrying the N strides of a sub-mapping (filled in by
// construct_sub_strides).  For N == 0 the array still gets one element so
// that it is never declared with a zero bound.
template<class T, size_t N>
struct sub_strides
{
  T values[N == 0 ? 1 : N];
};

// Builds the strides of a sub-mapping.
//   src_mapping           source mapping whose stride(r) values are scaled
//   index_sequence        InvMapIdxs: the source ranks to take strides from,
//                         one per entry of the result
//   slices_stride_factor  tuple with one stride factor per source slice
// Entry k of the result is src_mapping.stride(r) * get<r>(factor tuple) with
// r = InvMapIdxs[k], both operands converted to the mapping's index_type.
template <class SrcMapping, class... slice_strides, size_t... InvMapIdxs>
MDSPAN_INLINE_FUNCTION constexpr auto construct_sub_strides(
    const SrcMapping &src_mapping, std::index_sequence<InvMapIdxs...>,
    const MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<slice_strides...> &slices_stride_factor) {
  using index_type = typename SrcMapping::index_type;
  using strides_t = sub_strides<index_type, sizeof...(InvMapIdxs)>;
  return strides_t{{
      (static_cast<index_type>(src_mapping.stride(InvMapIdxs)) *
       static_cast<index_type>(get<InvMapIdxs>(slices_stride_factor)))...}};
}

template<class SliceSpecifier, class IndexType>
struct is_range_slice {
  constexpr static bool value =
    std::is_same_v<SliceSpecifier, full_extent_t> ||
    index_pair_like<SliceSpecifier, IndexType>::value;
};

template<class SliceSpecifier, class IndexType>
constexpr bool is_range_slice_v = is_range_slice<SliceSpecifier, IndexType>::value;

// Trait: a slice specifier is an "index slice" when it is convertible to
// IndexType.
template<class SliceSpecifier, class IndexType>
struct is_index_slice {
  constexpr static bool value =
    std::is_convertible<SliceSpecifier, IndexType>::value;
};

// Variable-template shorthand for is_index_slice<...>::value.
template<class SliceSpecifier, class IndexType>
constexpr bool is_index_slice_v = is_index_slice<SliceSpecifier, IndexType>::value;

} // namespace detail

//**********************************
// layout_left submdspan_mapping
//*********************************
namespace detail {

// Figure out whether to preserve layout_left
//
// Compile-time classifier of the slice specifiers applied to a layout_left
// or layout_left_padded source mapping.
//   IndexType        index type used to classify each slice specifier
//   SubRank          rank of the resulting sub-extents
//   IndexSequence    std::index_sequence with one index per slice specifier
//   SliceSpecifiers  the slice specifier types, in rank order
template <class IndexType, size_t SubRank, class IndexSequence,
          class... SliceSpecifiers>
struct deduce_layout_left_submapping;

template <class IndexType, size_t SubRank, size_t... Idx,
          class... SliceSpecifiers>
struct deduce_layout_left_submapping<
    IndexType, SubRank, std::index_sequence<Idx...>, SliceSpecifiers...> {

  // Scan over a 0/1 sequence holding 1 for every slice that is not an index
  // slice.  NOTE(review): index_sequence_scan_impl is defined elsewhere;
  // get(i) presumably yields a running count of non-index slices -- confirm
  // whether position i itself is included.
  using count_range = index_sequence_scan_impl<
      0u, (is_index_slice_v<SliceSpecifiers, IndexType> ? 0u : 1u)...>;

  // Number of index slices at positions Idx > 0 for which
  // count_range::get(Idx) == 1.  The padded check below treats this as the
  // length of the run of index slices right after the leftmost slice.
  constexpr static int gap_len =
      (((Idx > 0 && count_range::get(Idx) == 1 &&
         is_index_slice_v<SliceSpecifiers, IndexType>)
            ? 1
            : 0) +
       ... + 0);

  // True when the sub-mapping can be a plain layout_left mapping.
  MDSPAN_INLINE_FUNCTION
  constexpr static bool layout_left_value() {
    // Use layout_left for rank 0
    if constexpr (SubRank == 0) {
      return true;
    // Use layout_left for rank 1 result if leftmost slice specifier is range like
    } else if constexpr (SubRank == 1) {
      // Only the slice at Idx == 0 is constrained; every Idx > 0 passes.
      return ((Idx > 0 || is_range_slice_v<SliceSpecifiers, IndexType>)&&...);
    } else {
      // Preserve if leftmost SubRank-1 slices are full_extent_t and
      // the slice at idx SubRank - 1 is a range and
      // for idx > SubRank - 1 the slice is an index
      return ((((Idx <  SubRank - 1) && std::is_same_v<SliceSpecifiers, full_extent_t>) ||
               ((Idx == SubRank - 1) && is_range_slice_v<SliceSpecifiers, IndexType>) ||
               ((Idx >  SubRank - 1) && is_index_slice_v<SliceSpecifiers, IndexType>)) && ...);
    }
    // Every branch above returns; this only silences a missing-return
    // diagnostic on NVCC host compilation with GCC.
#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
    __builtin_unreachable();
#endif
  }

  // True when the sub-mapping can be a layout_left_padded mapping.
  MDSPAN_INLINE_FUNCTION
  constexpr static bool layout_left_padded_value() {
    // Technically could also keep layout_left_padded for SubRank==0
    // and SubRank==1 with leftmost slice specifier being a contiguous range
    // but we intercept these cases separately

    // In all other cases:
    // leftmost slice must be range
    // then there can be a gap with index slices
    // then SubRank - 2 full_extent slices
    // then another range slice
    // then more index slices
    // e.g. R I I I F F F R I I for obtaining a rank-5 from a rank-10
    //
    // Each line below handles one of those position ranges; a slice passes
    // when the clause covering its position accepts its type.
    return ((((Idx == 0)                                       && is_range_slice_v<SliceSpecifiers, IndexType>) ||
             ((Idx > 0 && Idx <= gap_len)                     && is_index_slice_v<SliceSpecifiers, IndexType>) ||
             ((Idx > gap_len && Idx < gap_len + SubRank - 1) && std::is_same_v<SliceSpecifiers, full_extent_t>) ||
             ((Idx == gap_len + SubRank - 1)                  && is_range_slice_v<SliceSpecifiers, IndexType>) ||
             ((Idx >  gap_len + SubRank - 1)                  && is_index_slice_v<SliceSpecifiers, IndexType>)) && ... );
  }
};

// Static padding stride for a layout_left_padded sub-mapping.  Shared by the
// layout_left and layout_left_padded sources; for a layout_left source the
// caller passes static_extent(0) as StaticStride.
//
// The result is StaticStride multiplied by the static extents at indices
// 1 .. NumGaps.  Any dynamic factor is replaced by 0, which zeroes the whole
// product; a zero product is reported as dynamic_extent.  Using 0 as the
// marker is sound because no genuine factor can be zero: StaticStride can
// never be zero, and the extents involved belong to integral slice
// specifiers, which would not be valid for a zero extent.
template<class Extents, size_t NumGaps, size_t StaticStride, size_t... Idx>
MDSPAN_INLINE_FUNCTION constexpr size_t
compute_s_static_layout_left(std::index_sequence<Idx...>) {
  const size_t stride_factor =
      (StaticStride == dynamic_extent) ? 0 : StaticStride;
  const size_t product =
      (((Idx > 0 && Idx <= NumGaps)
            ? ((Extents::static_extent(Idx) == dynamic_extent)
                   ? 0
                   : Extents::static_extent(Idx))
            : 1) *
       ... * stride_factor);
  return (product == 0) ? dynamic_extent : product;
}

} // namespace detail

// Actual submdspan mapping call
//
// Computes the sub-mapping obtained by applying `slices` to this layout_left
// mapping.  Returns a submdspan_mapping_result whose mapping type is chosen
// at compile time from the slice specifier types:
//   - layout_left         when deduce_layout::layout_left_value() holds,
//   - layout_left_padded  when deduce_layout::layout_left_padded_value()
//                         holds,
//   - layout_stride       otherwise.
// The result's offset is the source mapping evaluated at the lower bound of
// every slice, or required_span_size() when some lower bound equals its
// extent.
template <class Extents>
template <class... SliceSpecifiers>
MDSPAN_INLINE_FUNCTION constexpr auto
layout_left::mapping<Extents>::submdspan_mapping_impl(
    SliceSpecifiers... slices) const {

  // compute sub extents
  using src_ext_t = Extents;
  auto dst_ext = submdspan_extents(extents(), slices...);
  using dst_ext_t = decltype(dst_ext);

  // figure out sub layout type
  using deduce_layout = detail::deduce_layout_left_submapping<
      typename dst_ext_t::index_type, dst_ext_t::rank(),
      std::make_index_sequence<src_ext_t::rank()>,
      SliceSpecifiers...>;

  // Figure out if any slice's lower bound equals the corresponding extent.
  // If so, bypass evaluating the layout mapping.  This fixes LWG Issue 4060.
  const bool out_of_bounds =
      detail::any_slice_out_of_bounds(this->extents(), slices...);
  auto offset = static_cast<size_t>(
      out_of_bounds ? this->required_span_size()
                    : this->operator()(detail::first_of(slices)...));

  if constexpr (deduce_layout::layout_left_value()) {
    // layout_left case
    using dst_mapping_t = typename layout_left::template mapping<dst_ext_t>;
    return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t(dst_ext),
                                                   offset};
  } else if constexpr (deduce_layout::layout_left_padded_value()) {
    // layout_left_padded case: the static padding stride is derived from
    // static_extent(0) and the extents of the gap_len ranks that follow it;
    // the source stride at rank 1 + gap_len is passed as the constructor's
    // second argument.
    constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_left<Extents, deduce_layout::gap_len, Extents::static_extent(0)>(std::make_index_sequence<Extents::rank()>());
    using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded<S_static>::template mapping<dst_ext_t>;
    return submdspan_mapping_result<dst_mapping_t>{
        dst_mapping_t(dst_ext, stride(1 + deduce_layout::gap_len)), offset};
  } else {
    // layout_stride case
    using dst_mapping_t = typename layout_stride::mapping<dst_ext_t>;
    // inv_map is handed to construct_sub_strides as the list of source ranks
    // whose strides make up the result (inv_map_rank is defined elsewhere).
    auto inv_map = detail::inv_map_rank(std::integral_constant<size_t, 0>(),
                                        std::index_sequence<>(), slices...);
    return submdspan_mapping_result<dst_mapping_t> {
      dst_mapping_t(mdspan_non_standard, dst_ext,
                    detail::construct_sub_strides(
                        *this, inv_map,
// HIP needs deduction guides to have markups so we need to be explicit
// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have
// the issue but Clang-CUDA also doesn't accept the use of deduction guide so
// disable it for CUDA altogether
#if defined(MDSPAN_IMPL_HAS_HIP) || defined(MDSPAN_IMPL_HAS_CUDA)
                        detail::tuple<decltype(detail::stride_of(slices))...>{
                            detail::stride_of(slices)...}).values),
#else
                        detail::tuple{detail::stride_of(slices)...}).values),
#endif
          offset
    };
  }
  // Every branch above returns; this only silences a missing-return
  // diagnostic on NVCC host compilation with GCC.
#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
  __builtin_unreachable();
#endif
}

// Computes the sub-mapping obtained by applying `slices` to this
// layout_left_padded mapping.  The result's mapping type is chosen at
// compile time:
//   - source rank 0:  the source mapping type itself, with offset 0,
//   - result rank 0:  layout_left,
//   - result rank 1 with deduce_layout::layout_left_value():  layout_left,
//   - deduce_layout::layout_left_padded_value():  layout_left_padded,
//   - otherwise:  layout_stride.
// Except in the source rank-0 case, the offset is the source mapping
// evaluated at the lower bound of every slice, or required_span_size() when
// some lower bound equals its extent.
template <size_t PaddingValue>
template <class Extents>
template <class... SliceSpecifiers>
MDSPAN_INLINE_FUNCTION constexpr auto
MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded<PaddingValue>::mapping<Extents>::submdspan_mapping_impl(
    SliceSpecifiers... slices) const {

  // compute sub extents
  using src_ext_t = Extents;
  auto dst_ext = submdspan_extents(extents(), slices...);
  using dst_ext_t = decltype(dst_ext);

  if constexpr (Extents::rank() == 0) { // rank-0 case
    using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded<PaddingValue>::template mapping<Extents>;
    return submdspan_mapping_result<dst_mapping_t>{*this, 0};
  } else {
    // Figure out if any slice's lower bound equals the corresponding extent.
    // If so, bypass evaluating the layout mapping.  This fixes LWG Issue 4060.
    const bool out_of_bounds =
        MDSPAN_IMPL_STANDARD_NAMESPACE::detail::any_slice_out_of_bounds(this->extents(), slices...);
    auto offset = static_cast<size_t>(
        out_of_bounds ? this->required_span_size()
                    : this->operator()(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::first_of(slices)...));
    if constexpr (dst_ext_t::rank() == 0) { // result rank-0
      // The following for some reasons leads to compiler error later, while not using a typedef works:
      // Compilers: CUDA 11.2 with GCC 9.1
      //
      // using dst_mapping_t = typename layout_left::template mapping<dst_ext_t>;
      // return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t{dst_ext}, offset};
      //
      // Error: submdspan_mapping.hpp:299:23: error: 'dst_mapping_t' does not name a type
      //         299 |         using dst_mapping_t = typename layout_left::template mapping<dst_ext_t>;
      // The same error is given (about dst_mapping_t not naming type) when a different name is used in 299:
      //        using dst_mapping_t2 = typename layout_left::template mapping<dst_ext_t>;

      return submdspan_mapping_result<typename layout_left::template mapping<dst_ext_t>>
             {typename layout_left::template mapping<dst_ext_t>{dst_ext}, offset};
    } else { // general case
      // figure out sub layout type
      using deduce_layout = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::deduce_layout_left_submapping<
        typename dst_ext_t::index_type, dst_ext_t::rank(),
        decltype(std::make_index_sequence<src_ext_t::rank()>()),
        SliceSpecifiers...>;

      if constexpr (deduce_layout::layout_left_value() && dst_ext_t::rank() == 1) { // getting rank-1 from leftmost
        using dst_mapping_t = typename layout_left::template mapping<dst_ext_t>;
        return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t{dst_ext}, offset};
      } else if constexpr (deduce_layout::layout_left_padded_value()) { // can keep layout_left_padded
        // The static padding stride starts from this mapping's
        // static_padding_stride; the source stride at rank 1 + gap_len is
        // passed as the constructor's second argument.
        constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_left<Extents, deduce_layout::gap_len, static_padding_stride>(std::make_index_sequence<Extents::rank()>());
        using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded<S_static>::template mapping<dst_ext_t>;
        return submdspan_mapping_result<dst_mapping_t>{
        dst_mapping_t(dst_ext, stride(1 + deduce_layout::gap_len)), offset};
      } else { // layout_stride
    // inv_map is handed to construct_sub_strides as the list of source ranks
    // whose strides make up the result (inv_map_rank is defined elsewhere).
    auto inv_map = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::inv_map_rank(std::integral_constant<size_t, 0>(),
                                        std::index_sequence<>(), slices...);
      using dst_mapping_t = typename layout_stride::template mapping<dst_ext_t>;
    return submdspan_mapping_result<dst_mapping_t> {
      dst_mapping_t(mdspan_non_standard, dst_ext,
                    MDSPAN_IMPL_STANDARD_NAMESPACE::detail::construct_sub_strides(
                        *this, inv_map,
// HIP needs deduction guides to have markups so we need to be explicit
// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have
// the issue but Clang-CUDA also doesn't accept the use of deduction guide so
// disable it for CUDA altogether
#if defined(MDSPAN_IMPL_HAS_HIP) || defined(MDSPAN_IMPL_HAS_CUDA)
                        MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<decltype(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices))...>{
                            MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...}).values),
#else
                        MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple{MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...}).values),
#endif
          offset
    };
      }
    }
  }


  // Every branch above returns; this only silences a missing-return
  // diagnostic on NVCC host compilation with GCC.
#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
  __builtin_unreachable();
#endif
}

//**********************************
// layout_right submdspan_mapping
//*********************************
namespace detail {

// Figure out whether to preserve layout_right
//
// Compile-time classifier of the slice specifiers applied to a layout_right
// or layout_right_padded source mapping.
//   IndexType        index type used to classify each slice specifier
//   SubRank          rank of the resulting sub-extents
//   IndexSequence    std::index_sequence with one index per slice specifier
//   SliceSpecifiers  the slice specifier types, in rank order
template <class IndexType, size_t SubRank, class IndexSequence,
          class... SliceSpecifiers>
struct deduce_layout_right_submapping;

template <class IndexType, size_t SubRank, size_t... Idx,
          class... SliceSpecifiers>
struct deduce_layout_right_submapping<
    IndexType, SubRank, std::index_sequence<Idx...>, SliceSpecifiers...> {

  // Rank of the source mapping (one index per slice specifier).
  static constexpr size_t Rank = sizeof...(Idx);
  // Scan over a 0/1 sequence holding 1 for every slice that is not
  // convertible to IndexType.  NOTE(review): index_sequence_scan_impl is
  // defined elsewhere; get(i) presumably yields a running count of such
  // slices -- confirm whether position i itself is included.
  using count_range = index_sequence_scan_impl<
      0u, (std::is_convertible_v<SliceSpecifiers, IndexType> ? 0u : 1u)...>;
  //__static_partial_sums<!std::is_convertible_v<SliceSpecifiers,
  // IndexType>...>;
  // Number of index slices at positions Idx < Rank - 1 for which
  // count_range::get(Idx) == SubRank - 1.  The padded check below treats
  // this as the length of the run of index slices right before the
  // rightmost slice.
  constexpr static int gap_len =
      (((Idx < Rank - 1 && count_range::get(Idx) == SubRank - 1 &&
         std::is_convertible_v<SliceSpecifiers, IndexType>)
            ? 1
            : 0) +
       ... + 0);

  // True when the sub-mapping can be a plain layout_right mapping.
  MDSPAN_INLINE_FUNCTION
  constexpr static bool layout_right_value() {
    // Use layout_right for rank 0
    if constexpr (SubRank == 0) {
      return true;
    // Use layout_right for rank 1 result if rightmost slice specifier is range like
    } else if constexpr (SubRank == 1) {
      // Only the slice at Idx == Rank - 1 is constrained; all others pass.
      return ((Idx < Rank - 1 || is_range_slice_v<SliceSpecifiers, IndexType>)&&...);
    } else {
      // Preserve if rightmost SubRank-1 slices are full_extent_t and
      // the slice at idx Rank-Subrank is a range and
      // for idx < Rank - SubRank the slice is an index
      //
      // The first clause also admits Idx == Rank - SubRank; that overlap
      // with the second clause is harmless because full_extent_t is itself
      // a range slice (see is_range_slice).
      return ((((Idx >= Rank - SubRank) && std::is_same_v<SliceSpecifiers, full_extent_t>) ||
               ((Idx == Rank - SubRank) && is_range_slice_v<SliceSpecifiers, IndexType>) ||
               ((Idx <  Rank - SubRank) && is_index_slice_v<SliceSpecifiers, IndexType>)) && ...);
    }
    // Every branch above returns; this only silences a missing-return
    // diagnostic on NVCC host compilation with GCC.
#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
    __builtin_unreachable();
#endif
  }

  // True when the sub-mapping can be a layout_right_padded mapping.
  MDSPAN_INLINE_FUNCTION
  constexpr static bool layout_right_padded_value() {
    // Technically could also keep layout_right_padded for SubRank==0
    // and SubRank==1 with rightmost slice specifier being a contiguous range
    // but we intercept these cases separately

    // In all other cases:
    // rightmost slice must be range
    // then there can be a gap with index slices
    // then SubRank - 2 full_extent slices
    // then another range slice
    // then more index slices
    // e.g. I I R F F F I I I R for obtaining a rank-5 from a rank-10
    //
    // Each line below handles one of those position ranges; a slice passes
    // when the clause covering its position accepts its type.
    return ((((Idx == Rank - 1)                                               && is_range_slice_v<SliceSpecifiers, IndexType>) ||
             ((Idx >= Rank - gap_len - 1 && Idx < Rank - 1)                  && is_index_slice_v<SliceSpecifiers, IndexType>) ||
             ((Idx >  Rank - gap_len - SubRank && Idx < Rank - gap_len - 1) && std::is_same_v<SliceSpecifiers, full_extent_t>) ||
             ((Idx == Rank - gap_len - SubRank)                              && is_range_slice_v<SliceSpecifiers, IndexType>) ||
             ((Idx <  Rank - gap_len - SubRank)                              && is_index_slice_v<SliceSpecifiers, IndexType>)) && ... );
  }
};

// Static padding stride for a layout_right_padded sub-mapping.  Shared by
// the layout_right and layout_right_padded sources; for a layout_right
// source StaticStride is static_extent(Rank-1).
//
// The result is StaticStride multiplied by the static extents at indices
// Rank-1-NumGaps .. Rank-2.  Any dynamic factor is replaced by 0, which
// zeroes the whole product; a zero product is reported as dynamic_extent.
// Using 0 as the marker is sound because no genuine factor can be zero:
// StaticStride can never be zero, and the extents involved belong to
// integral slice specifiers, which would not be valid for a zero extent.
template<class Extents, size_t NumGaps, size_t StaticStride, size_t... Idx>
MDSPAN_INLINE_FUNCTION constexpr size_t
compute_s_static_layout_right (std::index_sequence<Idx...>) {
  const size_t stride_factor =
      (StaticStride == dynamic_extent) ? 0 : StaticStride;
  const size_t product =
      (((Idx >= Extents::rank() - 1 - NumGaps && Idx < Extents::rank() - 1)
            ? ((Extents::static_extent(Idx) == dynamic_extent)
                   ? 0
                   : Extents::static_extent(Idx))
            : 1) *
       ... * stride_factor);
  return (product == 0) ? dynamic_extent : product;
}

} // namespace detail

// Actual submdspan mapping call
//
// Computes the sub-mapping obtained by applying `slices` to this
// layout_right mapping.  Returns a submdspan_mapping_result whose mapping
// type is chosen at compile time from the slice specifier types:
//   - layout_right         when deduce_layout::layout_right_value() holds,
//   - layout_right_padded  when deduce_layout::layout_right_padded_value()
//                          holds,
//   - layout_stride        otherwise.
// The result's offset is the source mapping evaluated at the lower bound of
// every slice, or required_span_size() when some lower bound equals its
// extent.
template <class Extents>
template <class... SliceSpecifiers>
MDSPAN_INLINE_FUNCTION constexpr auto
layout_right::mapping<Extents>::submdspan_mapping_impl(
    SliceSpecifiers... slices) const {

  // compute sub extents
  using src_ext_t = Extents;
  auto dst_ext = submdspan_extents(extents(), slices...);
  using dst_ext_t = decltype(dst_ext);

  // figure out sub layout type
  using deduce_layout = detail::deduce_layout_right_submapping<
      typename dst_ext_t::index_type, dst_ext_t::rank(),
      std::make_index_sequence<src_ext_t::rank()>,
      SliceSpecifiers...>;

  // Figure out if any slice's lower bound equals the corresponding extent.
  // If so, bypass evaluating the layout mapping.  This fixes LWG Issue 4060.
  const bool out_of_bounds =
      detail::any_slice_out_of_bounds(this->extents(), slices...);
  auto offset = static_cast<size_t>(
      out_of_bounds ? this->required_span_size()
                    : this->operator()(detail::first_of(slices)...));

  if constexpr (deduce_layout::layout_right_value()) {
    // layout_right case
    using dst_mapping_t = typename layout_right::mapping<dst_ext_t>;
    return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t(dst_ext),
                                                   offset};
  } else if constexpr (deduce_layout::layout_right_padded_value()) {
    // layout_right_padded case.  The static padding stride must describe the
    // same product as the runtime stride passed below,
    // stride(Rank - 2 - gap_len), i.e. the extents at ranks
    // Rank-1-gap_len .. Rank-1.  That is what compute_s_static_layout_right
    // computes; the layout_left helper would multiply the extents at ranks
    // 1 .. gap_len instead and yield a mismatching static stride whenever
    // 0 < gap_len < Rank - 2.
    constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_right<Extents, deduce_layout::gap_len, Extents::static_extent(Extents::rank() - 1)>(std::make_index_sequence<Extents::rank()>());
    using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded<S_static>::template mapping<dst_ext_t>;
    return submdspan_mapping_result<dst_mapping_t>{
        dst_mapping_t(dst_ext,
                      stride(src_ext_t::rank() - 2 - deduce_layout::gap_len)),
        offset};
  } else {
    // layout_stride case
    using dst_mapping_t = typename layout_stride::mapping<dst_ext_t>;
    // inv_map is handed to construct_sub_strides as the list of source ranks
    // whose strides make up the result (inv_map_rank is defined elsewhere).
    auto inv_map = detail::inv_map_rank(std::integral_constant<size_t, 0>(),
                                        std::index_sequence<>(), slices...);
    return submdspan_mapping_result<dst_mapping_t> {
      dst_mapping_t(mdspan_non_standard, dst_ext,
                    detail::construct_sub_strides(
                        *this, inv_map,
// HIP needs deduction guides to have markups so we need to be explicit
// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have
// the issue but Clang-CUDA also doesn't accept the use of deduction guide so
// disable it for CUDA altogether
#if defined(MDSPAN_IMPL_HAS_HIP) || defined(MDSPAN_IMPL_HAS_CUDA)
                        MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<decltype(detail::stride_of(slices))...>{
                            detail::stride_of(slices)...}).values),
#else
                        MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple{detail::stride_of(slices)...}).values),
#endif
          offset
    };
  }
  // Every branch above returns; this only silences a missing-return
  // diagnostic on NVCC host compilation with GCC.
#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
  __builtin_unreachable();
#endif
}

// Computes the sub-mapping obtained by applying `slices` to this
// layout_right_padded mapping.  The result's mapping type is chosen at
// compile time:
//   - source rank 0:  the source mapping type itself, with offset 0,
//   - result rank 0:  layout_right,
//   - result rank 1 with deduce_layout::layout_right_value():  layout_right,
//   - deduce_layout::layout_right_padded_value():  layout_right_padded,
//   - otherwise:  layout_stride.
// Except in the source rank-0 case, the offset is the source mapping
// evaluated at the lower bound of every slice, or required_span_size() when
// some lower bound equals its extent.
template <size_t PaddingValue>
template <class Extents>
template <class... SliceSpecifiers>
MDSPAN_INLINE_FUNCTION constexpr auto
MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded<PaddingValue>::mapping<Extents>::submdspan_mapping_impl(
    SliceSpecifiers... slices) const {

  // compute sub extents
  using src_ext_t = Extents;
  auto dst_ext = submdspan_extents(extents(), slices...);
  using dst_ext_t = decltype(dst_ext);

  if constexpr (Extents::rank() == 0) { // rank-0 case
    using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded<PaddingValue>::template mapping<Extents>;
    return submdspan_mapping_result<dst_mapping_t>{*this, 0};
  } else {
    // Figure out if any slice's lower bound equals the corresponding extent.
    // If so, bypass evaluating the layout mapping.  This fixes LWG Issue 4060.
    const bool out_of_bounds =
        MDSPAN_IMPL_STANDARD_NAMESPACE::detail::any_slice_out_of_bounds(this->extents(), slices...);
    auto offset = static_cast<size_t>(
        out_of_bounds ? this->required_span_size()
                    : this->operator()(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::first_of(slices)...));
    if constexpr (dst_ext_t::rank() == 0) { // result rank-0
      // Same issue as in layout_left_padded: see comment there
      // using dst_mapping_t = typename layout_right::template mapping<dst_ext_t>;
      // return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t{dst_ext}, offset};
      return submdspan_mapping_result<typename layout_right::template mapping<dst_ext_t>>
        {typename layout_right::template mapping<dst_ext_t>{dst_ext}, offset};
    } else { // general case
      // figure out sub layout type
      using deduce_layout = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::deduce_layout_right_submapping<
        typename dst_ext_t::index_type, dst_ext_t::rank(),
        decltype(std::make_index_sequence<src_ext_t::rank()>()),
        SliceSpecifiers...>;

      if constexpr (deduce_layout::layout_right_value() && dst_ext_t::rank() == 1) { // getting rank-1 from rightmost
        using dst_mapping_t = typename layout_right::template mapping<dst_ext_t>;
        return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t{dst_ext}, offset};
      } else if constexpr (deduce_layout::layout_right_padded_value()) { // can keep layout_right_padded
        // The static padding stride starts from this mapping's
        // static_padding_stride; the source stride at rank
        // Rank - 2 - gap_len is passed as the constructor's second argument.
        constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_right<Extents, deduce_layout::gap_len, static_padding_stride>(std::make_index_sequence<Extents::rank()>());
        using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded<S_static>::template mapping<dst_ext_t>;
        return submdspan_mapping_result<dst_mapping_t>{
        dst_mapping_t(dst_ext, stride(Extents::rank() - 2 - deduce_layout::gap_len)), offset};
      } else { // layout_stride
    // inv_map is handed to construct_sub_strides as the list of source ranks
    // whose strides make up the result (inv_map_rank is defined elsewhere).
    auto inv_map = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::inv_map_rank(std::integral_constant<size_t, 0>(),
                                        std::index_sequence<>(), slices...);
      using dst_mapping_t = typename layout_stride::template mapping<dst_ext_t>;
    return submdspan_mapping_result<dst_mapping_t> {
      dst_mapping_t(mdspan_non_standard, dst_ext,
                    MDSPAN_IMPL_STANDARD_NAMESPACE::detail::construct_sub_strides(
                        *this, inv_map,
// HIP needs deduction guides to have markups so we need to be explicit
// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have
// the issue but Clang-CUDA also doesn't accept the use of deduction guide so
// disable it for CUDA altogether
#if defined(MDSPAN_IMPL_HAS_HIP) || defined(MDSPAN_IMPL_HAS_CUDA)
                        MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<decltype(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices))...>{
                            MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...}).values),
#else
                        MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple{MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...}).values),
#endif
          offset
    };
      }
    }
  }


  // Every branch above returns; this only silences a missing-return
  // diagnostic on NVCC host compilation with GCC.
#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
  __builtin_unreachable();
#endif
}

//**********************************
// layout_stride submdspan_mapping
//*********************************
// Computes the sub-mapping obtained by applying `slices` to this
// layout_stride mapping.  The result is always a layout_stride mapping over
// the sub-extents; its strides come from construct_sub_strides, which
// multiplies the selected source strides by the stride factor of the
// corresponding slice.  The offset is the source mapping evaluated at the
// lower bound of every slice, or required_span_size() when some lower bound
// equals its extent.
template <class Extents>
template <class... SliceSpecifiers>
MDSPAN_INLINE_FUNCTION constexpr auto
layout_stride::mapping<Extents>::submdspan_mapping_impl(
    SliceSpecifiers... slices) const {
  auto dst_ext = submdspan_extents(extents(), slices...);
  using dst_ext_t = decltype(dst_ext);
  // inv_map is handed to construct_sub_strides as the list of source ranks
  // whose strides make up the result (inv_map_rank is defined elsewhere).
  auto inv_map = detail::inv_map_rank(std::integral_constant<size_t, 0>(),
                                      std::index_sequence<>(), slices...);
  using dst_mapping_t = typename layout_stride::template mapping<dst_ext_t>;

  // Figure out if any slice's lower bound equals the corresponding extent.
  // If so, bypass evaluating the layout mapping.  This fixes LWG Issue 4060.
  const bool out_of_bounds =
      detail::any_slice_out_of_bounds(this->extents(), slices...);
  auto offset = static_cast<size_t>(
      out_of_bounds ? this->required_span_size()
                    : this->operator()(detail::first_of(slices)...));

  return submdspan_mapping_result<dst_mapping_t> {
    dst_mapping_t(mdspan_non_standard, dst_ext,
                  detail::construct_sub_strides(
                      *this, inv_map,
// HIP needs deduction guides to have markups so we need to be explicit
// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have
// the issue but Clang-CUDA also doesn't accept the use of deduction guide so
// disable it for CUDA altogether
#if defined(MDSPAN_IMPL_HAS_HIP) || defined(MDSPAN_IMPL_HAS_CUDA)
                      MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<decltype(detail::stride_of(slices))...>(
                          detail::stride_of(slices)...)).values),
#else
                      MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple(detail::stride_of(slices)...)).values),
#endif
        offset
  };
}

} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE

#if defined __NVCC__
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diagnostic pop
#else
#ifdef __CUDA_ARCH__
#pragma diagnostic pop
#endif
#endif
#elif defined __NVCOMPILER
#pragma diagnostic pop
#endif