//render_vsync_demo.cpp
//forked from ad8e/vsync_blurbusters
#define DISABLE_FONTS 1
//miscellaneous helper files I carry around
#include "timing.cpp"
#include "console.h"
#include "renderer.h"
#include "render_present.cpp"
#include "frame_time_measurement.cpp"
#include <thread>
#include <atomic>
#include <mutex>
#if LOAD_WITH_GLAD
#include "etc/glad.c"
#endif
#include "platform_vsync.cpp" //platform-specific APIs for finding the vsync point
#include "vsync.cpp" //calculates phase and period when vsync is grabbed in a separate thread
#include "vsync_with_scanline.cpp" //calculates phase and period when the scanline is grabbed in the render thread
namespace render {
//geometry batch used by the demo. render_loop() attaches the shader program, VAO and VBO to it, then submits geometry every frame through draw_triangle()/draw_ortho() followed by move_and_render().
GL_buffer<uint32_t> triangles;
//wraparound-aware "a comes before b" for timestamps and other free-running counters.
//the unsigned difference is reinterpreted as signed, so the answer stays correct across a counter overflow as long as the two values are less than half the counter range apart.
bool lt_circular(uint64_t a, uint64_t b) {
	const uint64_t wrapped_distance = a - b;
	const int64_t signed_distance = int64_t(wrapped_distance);
	return signed_distance < 0;
}
bool lt_circular(uint32_t a, uint32_t b) {
	const uint32_t wrapped_distance = a - b;
	const int32_t signed_distance = int32_t(wrapped_distance);
	return signed_distance < 0;
}
//mixed or non-matching argument types are rejected at compile time: an implicit conversion would defeat the fixed-width unsigned arithmetic this comparison relies on.
template <class T, class U>
bool lt_circular(T a, U b) = delete;
void render_loop() {
glfwMakeContextCurrent(window);
tell_system_whether_to_wait_for_vsync();
#if LOAD_WITH_GLAD
check(gladLoadGLLoader((GLADloadproc)glfwGetProcAddress), "GLAD initialization failed");
#endif
#if SYNC_LINUX
prepare_sync();
#endif
get_scanline_info();
triangles.program = compile_shaders(R"(#version 330 core
layout (location = 0) in mediump vec2 pos;
layout (location = 1) in mediump vec4 vertex_color;
out mediump vec4 pixel_color;
void main() {
gl_Position = vec4(pos.x, pos.y, 0.0, 1.0);
pixel_color = vertex_color;
})",
R"(#version 330 core
out mediump vec4 color;
in mediump vec4 pixel_color;
void main() {
color = vec4(pixel_color.z, pixel_color.y, pixel_color.x, 1.0);
})");
glGenVertexArrays(1, &triangles.VAO);
glBindVertexArray(triangles.VAO);
glGenBuffers(1, &triangles.VBO);
glBindBuffer(GL_ARRAY_BUFFER, triangles.VBO);
glBufferData(GL_ARRAY_BUFFER, triangles.vertices.size() * sizeof(float), 0, GL_DYNAMIC_DRAW);
glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 12, (void*)0);
glEnableVertexAttribArray(0);
glVertexAttribPointer(1, GL_BGRA, GL_UNSIGNED_INT_2_10_10_10_REV, GL_TRUE, 12, (void*)(2 * sizeof(float)));
glEnableVertexAttribArray(1);
glClearColor(1.0, 1.0, 1.0, 1.0);
glViewport(0, 0, render::screen_w, render::screen_h);
auto time_previous_frame_start = now();
#if ANY_SYNC_SUPPORTED
uint64_t last_frame_vblank_target = time_previous_frame_start;
#endif
#if _WIN32
bool fast_timer_on_Windows = true;
improve_timer_resolution_on_Windows();
#endif
glGenQueries(frame_time_buffer_size, query_circular_buffer.data());
//variables for the animation
bool bar_flip = 0; //flips every full run
bool color_flip = 0; //flips every frame
float bar_x = 0;
while (!time_to_exit()) {
glfwPollEvents();
//uint64_t time_at_frame_start = now() + int64_t(generate_noise_for_timepoint.next_float() * ticks_per_sec / 60 / 16); //adds noise to the timepoint, for checking performance of the vsync finder
uint64_t time_at_frame_start = now(); //if spam_swap is true, no need to call this. oh well. synchronizing the behavior would be too annoying, as spam_swap can change between frames, and then the previous timestamp would be out of whack. easier to just always call the timestamp.
//vscan gives slightly less error if the scanline is before the timepoint. however, it's marginal: 0.0042 ms vs 0.0044 ms. it wobbles too. hard to tell if it's just noise.
//if it's spam-swapping, we could get it only once per vsync. however, I think I don't care.
#if SYNC_IN_RENDER_THREAD && SCANLINE_VSYNC
uint64_t scanline;
if (sync_mode == sync_in_render_thread) {
scanline = get_scanline();
vscan::new_value(time_at_frame_start, scanline); //we reuse the time at frame start. that forces our scanline operation to be next to it, so there is no decision on where in a frame the scanline retrieval should be.
update_scanline_boundaries();
}
#endif
#if SYNC_LINUX
get_sync_values();
#endif
//whether you are trying to sync to the vsync point by waiting and swapping at a tearline
bool vsync_period_phase_info_available = (sync_mode == separate_heartbeat) || (sync_mode == sync_in_render_thread);
//we want to measure GPU time to get more accurate waits.
//if the frames are taking too long, then we wouldn't be able to make use of GPU time anyway; the only possible strategy is to spam-swap, in which case the burden of measuring GPU time is a problem
//GPU timestamps are slow and heavy. CPU time is just a signal to check if we should measure this.
//(GPU time is short) || (CPU time between frames < vblank_period) = start measuring GPU time.
//thus, we only measure GPU time if we expect frame times to be below one frame, meeting one of the following conditions:
//1. the average GPU time is generally short enough (frame_time_smoothed)
//2. the most recent GPU time was short (frame_time_single). this enables a faster recovery - a single good frame leads to more measurements of more good frames.
//3. the CPU time is approximately equal to vblank_period - then sometimes we measure, sometimes not. this is a recovery mechanism and only needs to occasionally work.
//CPU time is capped from below by the vblank period, so there's no point in trying to be more reliable than grabbing the occasional instances where it dips below from noise.
//when the CPU time drops below, then GPU time measurement will kick in, and it'll stay measuring GPU time if it's appropriate.
bool measure_GPU_time_spent = false;
#if ANY_SYNC_SUPPORTED
uint64_t vblank_phase;
double vblank_period;
if (sync_mode == sync_in_render_thread) {
vblank_phase = vscan::phase;
vblank_period = vscan::period;
}
else if (sync_mode == separate_heartbeat) {
vblank_phase = vf::vblank_phase_atomic.load(std::memory_order_relaxed);
vblank_period = vf::vblank_period_atomic.load(std::memory_order_relaxed);
}
else
error_assert("implement me");
if (!spam_swap && vsync_period_phase_info_available)
measure_GPU_time_spent =
frame_time_single < vblank_period / ticks_per_sec ||
frame_time_smoothed < vblank_period / ticks_per_sec ||
time_at_frame_start - time_previous_frame_start < vblank_period;
//if frames might be on time, it's worth checking the GPU time.
//if frames are surely on time, it's worth syncing to vblank.
bool wait_and_tear = measure_GPU_time_spent && frame_time_smoothed < vblank_period / ticks_per_sec; //we need this. be safe if the vsync finder returns junk values. so bail out after calculation
//if period is more than one second. it's probably bogus information.
//if phase is more than 100 seconds away. it's not likely to be accurate.
//in both cases, just ignore it and spam-swap until we get real data
if (vblank_period > ticks_per_sec || (uint64_t)std::abs(int64_t(vblank_phase - time_at_frame_start)) > ticks_per_sec * 10) {
wait_and_tear = false;
}
uint64_t target_render_start_time;
uint64_t target_swap_time;
if (wait_and_tear) {
//outc("phase", int64_t(vblank_phase_from_wait - vblank_phase) * 1000.0 / ticks_per_sec); //the wakeup vsync mechanism is earlier than the scanline mechanism! this output produces negative values. that's because the wakeup is at the beginning of the front porch, not the vsync.
double time_between_render_start_and_tearline = frame_time_smoothed + render_overrun_buffer_room + GPU_swap_delay_undocumented;
double adjustment_for_image_presentation_late_in_frame = (double)(scanlines_between_sync_and_first_displayed_line - porch_scanlines) / total_scanlines;
double tearline_time_after_sync = user_desired_phase_offset + adjustment_for_image_presentation_late_in_frame; //aims for the end of the active display = beginning of the porch. this is because trying to render when the displayed lines go out seems to cause severe issues, so we avoid the end of the porch.
int64_t time_rel_vblank_phase = time_at_frame_start - vblank_phase;
int periods_to_move_forward_from_vblank = ceil((time_rel_vblank_phase + time_between_render_start_and_tearline * ticks_per_sec) / vblank_period - tearline_time_after_sync);
uint64_t vblank_target = vblank_phase + uint64_t(periods_to_move_forward_from_vblank * vblank_period);
if (int64_t(vblank_target - last_frame_vblank_target) < vblank_period / 2) {
//auto distance_to_ceil = [](double f) { return ceil(f) - f; };
//outc("extra wait, extra room was", distance_to_ceil((time_rel_vblank_phase + time_between_render_start_and_tearline * ticks_per_sec) / vblank_period - tearline_time_after_sync), periods_to_move_forward_from_vblank);
++periods_to_move_forward_from_vblank; //you rendered super fast and are trying to render the same frame. so wait another frame.
}
last_frame_vblank_target = vblank_phase + uint64_t(periods_to_move_forward_from_vblank * vblank_period); //re-calculate it in case periods changed
//tearline_time = vblank_phase + uint64_t((tearline_time_after_sync + periods_to_move_forward_from_vblank) * vblank_period);
target_render_start_time = vblank_phase + uint64_t((tearline_time_after_sync + periods_to_move_forward_from_vblank) * vblank_period - time_between_render_start_and_tearline * ticks_per_sec);
target_swap_time = vblank_phase + uint64_t((tearline_time_after_sync + periods_to_move_forward_from_vblank) * vblank_period - (GPU_swap_delay_undocumented + swap_time) * ticks_per_sec);
}
#endif
time_previous_frame_start = time_at_frame_start;
//it's possible this isn't capturing the CPU-side of the rendering. maybe todo.
if (measure_GPU_time_spent) {
GPU_timestamp_send();
}
//rendering starts now; we've done all the waiting we want and gathered all the information we will have.
auto time_at_render_start = now();
//the animation. generates the triangles needed
{
//if (left_click_dragging) { //we use this to not buffer anything, and only swap. this determines that the large spikes are not caused by syncing issues between bufferSubData and the device rendering thread.
bool single_bar = false;
if (single_bar) glClear(GL_COLOR_BUFFER_BIT); //lowers fps from 760 to 580.
//note: this will double clear on viewport change
uint32_t color = 1047961;
uint32_t ocolor = 1072693964;
uint32_t white = 1073741823;
if (color_flip) std::swap(color, ocolor);
color_flip = !color_flip;
auto bar = (bar_flip) ? 428867789 : 1073112064;
bar_x += 0.02f;
if (bar_x > 0.99f) {
bar_x = -2;
if (!single_bar)
bar_flip = !bar_flip;
}
float xoffset = 3 * (10) / float(screen_w); //for triangle size
float yoffset = 3 * (17.32f) / float(screen_h);
//current mouse
float screenx = 2 * mouse_x / float(screen_w) - 1.0f;
float screeny = 2 * mouse_y / float(screen_h) - 1.0f;
triangles.draw_triangle(screenx - xoffset, screeny + yoffset, color, screenx + xoffset, screeny + yoffset, ocolor, screenx, screeny, white);
//indicator at left of screen
triangles.draw_triangle(-2.f, 0.f, color, -0.98f, 2.f, color, -0.98f, -2.f, color);
//quad for the moving bar
triangles.draw_ortho(bar, bar_x, bar_x + 0.04f, (bar_x - 1) / 2, 1.f);
}
triangles.move_and_render();
#if ANY_SYNC_SUPPORTED
if (busy_wait_for_exact_swap && wait_and_tear && now() <= target_swap_time) {
//we have a wait operation. which means we must split the GPU measurement in two.
if (measure_GPU_time_spent)
GPU_timestamp_send(0);
accurate_sleep_until(target_swap_time);
if (measure_GPU_time_spent)
GPU_timestamp_send();
swap_now();
if (measure_GPU_time_spent) {
GPU_timestamp_send(1);
}
}
else
#endif
{
swap_now(); //Linux: if I turn this off, the input lag fixes itself! so glFlush is clobbering the latency of the event system
if (measure_GPU_time_spent)
GPU_timestamp_send(2);
//it's important to measure the time that swap takes, because it can be 6 ms.
//Intel HD 4000
//if we measure before the swap, without glFlush(), the timer reports 0.026 ms per frame
//if we measure after the swap, with glFlush(), the timer reports 0.53 ms per frame.
//the swap takes 0.5 ms
//Iris Xe: the delay is 0-6 ms, determined by the frequency of the GPU. you can see the GPU's frequency step around as its frequency changes.
//to check this, I installed tlp, and changed these configurations, which forced the GPU to its lowest frequency, and stabilized the tearline at 2/5 down the screen:
//INTEL_GPU_MAX_FREQ_ON_AC=100
//INTEL_GPU_BOOST_FREQ_ON_AC=100
//INTEL_GPU_MIN_FREQ_ON_AC=100
//vf::new_value(now()); //for testing how accurate GPU wakeup is. turn on double buffer vsync, (it's not possible with separate_heartbeat_vsync, because it collides with the vsync finder. you'd need to copy a new one into a separate namespace)
//if (vf::elements() > 16) outc("jitter:", vf::calc_error_in_shitty_way() * 1000.0 / ticks_per_sec);
}
if (measure_GPU_time_spent) {
//uint max_queries_to_retrieve_per_frame = 2; //if you get 1 query, you're treading. if you get 2, you're moving forward. consuming 2 over time is good enough to recover from any delay. we don't want to ask for queries more than necessary.
//however, now there can be either 2 or 4 queries. it's probably better not to set a limit.
//I tried to measure performance of the Query operations by running Query 100 times, and looking at the sine wave animation. I turned on GPU timestamp measurement and turned off the vblank sync.
//then, comment out glDeleteQueries(), because that is causing most of the jitter, which is making further jitter hard to see
//"if (!spam_swap) measure_GPU_time_spent =" -> "if (1)"
//"if (measure_GPU_time_spent) {" -> "if (0) {", where the vblank phase operations are
//results are below in the zero_to(1000) comments
while (lt_circular(index_lagging_GPU_time_to_retrieve, index_next_query_available)) {
GLint done = 0;
//for (int i : zero_to(1000)) //this checks how expensive the query availability retrieval is. interestingly, with Query deletion on, the sine wave animation is _more_ consistent when checking 100 times, than when checking once! it stops twitching back and forth different times per frame, and starts twitching evenly across frames. I assume that's bad even though it looks good.
//if I check 1000 times, and turn Query deletion off, then the animation starts skipping 2 bars (32 pixels) instead of 1 bar (16 pixel). so it's pretty expensive
glGetQueryObjectiv(query_circular_buffer[index_lagging_GPU_time_to_retrieve % frame_time_buffer_size], GL_QUERY_RESULT_AVAILABLE, &done);
if (done)
GPU_timestamp_retrieve();
else
break;
}
}
}
}
} // namespace render
//GLFW cursor-position callback (registered in main() via glfwSetCursorPosCallback).
//stores the cursor position in render::mouse_x / render::mouse_y, offset by half a pixel, with the y axis flipped against render::screen_h.
void mouse_cursor_callback(GLFWwindow* window, double xpos, double ypos) {
	(void)window; //required by the callback signature, not used here
	const double distance_from_bottom = render::screen_h - ypos;
	render::mouse_x = 0.5 + xpos;
	render::mouse_y = distance_from_bottom - 0.5;
}
//program entry point.
//initializes GLFW, requests an OpenGL 3.3 core, forward-compatible debug context, and creates a window on the primary monitor sized to that monitor's current video mode (title is argv[0]).
//seeds the vsync period estimate from the reported refresh rate, starts the detached vsync heartbeat thread when that mode is compiled in and selected, then runs render::render_loop() on this thread.
//returns -1 if GLFW initialization, the monitor/video-mode query, or window creation fails.
int main(int argc, char** argv) {
	if (!glfwInit()) return -1;
	glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
	glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
	glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
	glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, true);
	glfwWindowHint(GLFW_OPENGL_DEBUG_CONTEXT, true);
	using namespace render;
	if (render::sync_mode != double_buffer_vsync && !double_buffered) glfwWindowHint(GLFW_DOUBLEBUFFER, 0); //turn off double buffering if it's not used
	active_monitor = glfwGetPrimaryMonitor();
	//glfwGetPrimaryMonitor() returns NULL when no monitor is found and glfwGetVideoMode() returns NULL on error. bail out instead of dereferencing a null mode.
	const GLFWvidmode* mode = active_monitor ? glfwGetVideoMode(active_monitor) : nullptr;
	if (!mode) {
		glfwTerminate();
		return -1;
	}
	window = glfwCreateWindow(mode->width, mode->height, argv[0], active_monitor, nullptr);
	render::screen_w = mode->width;
	render::screen_h = mode->height;
	if (!window) {
		glfwTerminate();
		return -1;
	}
	glfwSetCursorPosCallback(window, mouse_cursor_callback);
	auto monitor_Hz = get_refresh_rate();
	extern double system_claimed_monitor_Hz;
	system_claimed_monitor_Hz = monitor_Hz;
	//initial period guess in timer ticks, taken from the refresh rate the system reports
	if (sync_mode == sync_in_render_thread)
		vscan::period = ticks_per_sec / double(monitor_Hz);
	else if (sync_mode == separate_heartbeat)
		vf::vblank_period_atomic.store(ticks_per_sec / double(monitor_Hz), std::memory_order_relaxed);
#if SYNC_IN_SEPARATE_THREAD
	if (render::sync_mode == separate_heartbeat) {
		//the heartbeat thread is detached and never joined; it is left running when main() returns.
		std::thread vsync_timer(get_vsynctimes);
		vsync_timer.detach();
	}
#endif
	render::render_loop();
	glfwTerminate();
}