a242a8ed0a118246ffe67662b289e58f8dce5e1a
[blender.git] / intern / cycles / kernel / filter / filter_features_sse.h
1 /*
2  * Copyright 2011-2017 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 CCL_NAMESPACE_BEGIN
18
/* Unaligned load of four consecutive floats starting at offset pass*pass_stride in buffer.
 * Expands in terms of `buffer` and `pass_stride`, which must be visible where the macro is used. */
#define ccl_get_feature_sse(pass) \
        _mm_loadu_ps(&buffer[(pass)*pass_stride])
20
/* Iterate over the pixel window [low.x, high.x) x [low.y, high.y), four horizontally adjacent pixels per step.
 *
 * The macro expands in terms of names that must be visible where it is used:
 * buffer, buffer_w, rect, low, high, pixel and pixel_buffer.
 * pixel_buffer always points to the first of the four current pixels in the first pass.
 * Inside the loop body, x4 and y4 hold the coordinates of the four pixels, and active_pixels is a mask
 * that is set for every lane whose x coordinate is below high.x, i.e. for all pixels within the window.
 * The two loops opened here have to be closed with END_FOR_PIXEL_WINDOW_SSE. */
#define FOR_PIXEL_WINDOW_SSE \
        pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
        for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
                __m128 y4 = _mm_set1_ps(pixel.y); \
                for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
                        /* Lane i covers the pixel at pixel.x + i. */ \
                        __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f)); \
                        __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
31
/* Closes the two loops opened by FOR_PIXEL_WINDOW_SSE.
 * When the inner loop finishes, pixel_buffer has been advanced by (pixel.x - low.x) floats along the row;
 * adding the rest of buffer_w moves it to the window's first column in the next row. */
#define END_FOR_PIXEL_WINDOW_SSE \
                } \
                pixel_buffer += buffer_w - (pixel.x - low.x); \
        }
35
36 ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
37 {
38         features[0] = x;
39         features[1] = y;
40         features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
41         features[3] = ccl_get_feature_sse(1);
42         features[4] = ccl_get_feature_sse(2);
43         features[5] = ccl_get_feature_sse(3);
44         features[6] = ccl_get_feature_sse(4);
45         features[7] = ccl_get_feature_sse(5);
46         features[8] = ccl_get_feature_sse(6);
47         features[9] = ccl_get_feature_sse(7);
48         if(mean) {
49                 for(int i = 0; i < DENOISE_FEATURES; i++)
50                         features[i] = _mm_sub_ps(features[i], mean[i]);
51         }
52         for(int i = 0; i < DENOISE_FEATURES; i++)
53                 features[i] = _mm_mask_ps(features[i], active_pixels);
54 }
55
56 ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
57 {
58         scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
59         scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
60
61         scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
62
63         __m128 diff, scale;
64         diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
65         scale = _mm_mul_ps(diff, diff);
66         diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
67         scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
68         diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
69         scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
70         scales[3] = _mm_mask_ps(scale, active_pixels);
71
72         scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
73
74         diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
75         scale = _mm_mul_ps(diff, diff);
76         diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
77         scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
78         diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
79         scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
80         scales[5] = _mm_mask_ps(scale, active_pixels);
81 }
82
83 ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
84 {
85         scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
86         scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
87         scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
88         scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
89
90         scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
91         scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
92 }
93
94
95 CCL_NAMESPACE_END