From: Carl Worth <cworth@cworth.org>
Date: Wed, 30 Oct 2013 21:24:52 +0000 (-0700)
Subject: Collect timer/monitor results whenever there are >1000 outstanding
X-Git-Url: https://git.cworth.org/git?p=fips;a=commitdiff_plain;h=9e6a41d89661ce7e983bac1b747755380604e447

Collect timer/monitor results whenever there are >1000 outstanding

Previously, fips always waited for a frame boundary before collecting
timer and monitor results. Now, whenever more than a maximum (set to
1000 here) number of monitors have been fired off, but no results
collected, fips will check and collect results for all timers/monitors
that have results available.

Here's some background on the debugging that led to this change:

  With an apitrace collected from "DOTA 2" we ran into crashes, always
  on the first frame of the game proper (after the opening menus,
  etc.). This frame is unusually large, (roughly half a million OpenGL
  calls).

  With that large frame, and the resulting large number of outstanding
  queries waiting to be collected, we were running into a resource
  limit and Mesa's performance-monitor code was crashing on an
  unexpectedly NULL bo->virtual pointer.

  A little digging determined that a DRM map ioctl was failing due to
  the map_count resource in the kernel being larger than the
  configured default (roughly 65530).

  After checking that neither fips nor Mesa was leaking any large
  number of buffer objects, (nor keeping many mapped), we decided to
  attempt this more aggressive collection of results in fips.

  As far as resource consumption in general, this does seem like a
  reasonable thing to do. If we have hundreds of outstanding queries,
  surely the oldest of them have completed, and we can free some
  resources by collecting those.

  On the other hand, it still seems wrong that the kernel is imposing
  an arbitrary limit on how many outstanding queries an application
  can have. The AMD_performance_monitor specification and
  implementation are not intended to have any such limitation. So,
  there's still some investigation to be done on what resource is
  causing the kernel's map_count to grow so large and to see if
  there's a bug there to be fixed.
---

diff --git a/metrics.c b/metrics.c
index eede09b..9b94eb6 100644
--- a/metrics.c
+++ b/metrics.c
@@ -105,12 +105,26 @@ typedef struct context
 
 	metrics_op_t op;
 
+	/* GL_TIME_ELAPSED query for which glEndQuery has not yet
+	 * been called. */
+	unsigned timer_begun_id;
+
+	/* GL_TIME_ELAPSED queries for which glEndQuery has been
+	 * called, (but results have not yet been queried). */
 	timer_query_t *timer_head;
 	timer_query_t *timer_tail;
 
+	/* Performance monitor for which glEndPerfMonitorAMD has not
+	 * yet been called. */
+	unsigned monitor_begun_id;
+
+	/* Performance monitors for which glEndPerfMonitorAMD has
+	 * been called, (but results have not yet been queried). */
 	monitor_t *monitor_head;
 	monitor_t *monitor_tail;
 
+	int monitors_in_flight;
+
 	unsigned num_op_metrics;
 	op_metrics_t *op_metrics;
 } context_t;
@@ -123,6 +137,11 @@ context_t current_context;
 int frames;
 int verbose;
 
+#define MAX_MONITORS_IN_FLIGHT 1000
+
+void
+metrics_collect_available (void);
+
 static void
 metrics_group_info_init (metrics_group_info_t *group, GLuint id)
 {
@@ -312,6 +331,10 @@ metrics_info_fini (void)
 	if (! info->initialized)
 		return;
 
+	if (ctx->timer_begun_id) {
+		ctx->timer_begun_id = 0;
+	}
+
 	for (timer = ctx->timer_head;
 	     timer;
 	     timer = timer_next)
@@ -322,6 +345,10 @@ metrics_info_fini (void)
 	ctx->timer_head = NULL;
 	ctx->timer_tail = NULL;
 
+	if (ctx->monitor_begun_id) {
+		ctx->monitor_begun_id = 0;
+	}
+
 	for (monitor = ctx->monitor_head;
 	     monitor;
 	     monitor = monitor_next)
@@ -332,6 +359,8 @@ metrics_info_fini (void)
 	ctx->monitor_head = NULL;
 	ctx->monitor_tail = NULL;
 
+	current_context.monitors_in_flight = 0;
+
 	for (i = 0; i < info->num_groups; i++)
 		metrics_group_info_fini (&info->groups[i]);
 
@@ -396,42 +425,12 @@ void
 metrics_counter_start (void)
 {
 	context_t *ctx = &current_context;
-	timer_query_t *timer;
-	monitor_t *monitor;
 	unsigned i;
 
-	/* Create new timer query, add to list */
-	timer = xmalloc (sizeof (timer_query_t));
-
-	timer->op = ctx->op;
-	timer->next = NULL;
-
-	if (ctx->timer_tail) {
-		ctx->timer_tail->next = timer;
-		ctx->timer_tail = timer;
-	} else {
-		ctx->timer_tail = timer;
-		ctx->timer_head = timer;
-	}
-
-	/* Create a new performance-monitor query */
-	monitor = xmalloc (sizeof (monitor_t));
-
-	monitor->op = ctx->op;
-	monitor->next = NULL;
-
-	if (ctx->monitor_tail) {
-		ctx->monitor_tail->next = monitor;
-		ctx->monitor_tail = monitor;
-	} else {
-		ctx->monitor_tail = monitor;
-		ctx->monitor_head = monitor;
-	}
-
 	/* Initialize the timer_query and monitor objects */
-	glGenQueries (1, &timer->id);
+	glGenQueries (1, &ctx->timer_begun_id);
 
-	glGenPerfMonitorsAMD (1, &monitor->id);
+	glGenPerfMonitorsAMD (1, &ctx->monitor_begun_id);
 
 	for (i = 0; i < ctx->metrics_info.num_groups; i++)
 	{
@@ -450,23 +449,69 @@ metrics_counter_start (void)
 
 		}
 
-		glSelectPerfMonitorCountersAMD(monitor->id,
+		glSelectPerfMonitorCountersAMD(ctx->monitor_begun_id,
 					       GL_TRUE, group->id,
 					       num_counters,
 					       group->counter_ids);
 	}
 
 	/* Start the queries */
-	glBeginQuery (GL_TIME_ELAPSED, timer->id);
+	glBeginQuery (GL_TIME_ELAPSED, ctx->timer_begun_id);
 
-	glBeginPerfMonitorAMD (monitor->id);
+	glBeginPerfMonitorAMD (ctx->monitor_begun_id);
 }
 
 void
 metrics_counter_stop (void)
 {
+	context_t *ctx = &current_context;
+	timer_query_t *timer;
+	monitor_t *monitor;
+
+	/* Stop the current timer and monitor. */
 	glEndQuery (GL_TIME_ELAPSED);
-	glEndPerfMonitorAMD (current_context.monitor_tail->id);
+	glEndPerfMonitorAMD (ctx->monitor_begun_id);
+
+	/* Add these IDs to our lists of outstanding queries and
+	 * monitors so the results can be collected later. */
+	timer = xmalloc (sizeof (timer_query_t));
+
+	timer->op = ctx->op;
+	timer->id = ctx->timer_begun_id;
+	timer->next = NULL;
+
+	if (ctx->timer_tail) {
+		ctx->timer_tail->next = timer;
+		ctx->timer_tail = timer;
+	} else {
+		ctx->timer_tail = timer;
+		ctx->timer_head = timer;
+	}
+
+	/* Create a new performance-monitor query */
+	monitor = xmalloc (sizeof (monitor_t));
+
+	monitor->op = ctx->op;
+	monitor->id = ctx->monitor_begun_id;
+	monitor->next = NULL;
+
+	if (ctx->monitor_tail) {
+		ctx->monitor_tail->next = monitor;
+		ctx->monitor_tail = monitor;
+	} else {
+		ctx->monitor_tail = monitor;
+		ctx->monitor_head = monitor;
+	}
+
+	ctx->monitors_in_flight++;
+
+	/* Avoid being a resource hog and collect outstanding results
+	 * once we have sent off a large number of
+	 * queries. (Presumably, many of the outstanding queries are
+	 * available by now.)
+	 */
+	if (ctx->monitors_in_flight > MAX_MONITORS_IN_FLIGHT)
+		metrics_collect_available ();
 }
 
 void
@@ -824,24 +869,10 @@ metrics_exit (void)
 	metrics_info_fini ();
 }
 
-
 void
-metrics_end_frame (void)
+metrics_collect_available (void)
 {
 	context_t *ctx = &current_context;
-	static int initialized = 0;
-	static struct timeval tv_start, tv_now;
-
-	if (! initialized) {
-		gettimeofday (&tv_start, NULL);
-		atexit (metrics_exit);
-		if (getenv ("FIPS_VERBOSE"))
-			verbose = 1;
-		initialized = 1;
-	}
-
-	frames++;
-	gettimeofday (&tv_now, NULL);
 
 	/* Consume all timer queries that are ready. */
 	timer_query_t *timer = ctx->timer_head;
@@ -906,12 +937,37 @@ metrics_end_frame (void)
 		glDeletePerfMonitorsAMD (1, &monitor->id);
 
 		free (monitor);
+
+		ctx->monitors_in_flight--;
+
 		monitor = ctx->monitor_head;
 	}
+}
+
+
+void
+metrics_end_frame (void)
+{
+	static int initialized = 0;
+	static struct timeval tv_start, tv_now;
+
+	if (! initialized) {
+		gettimeofday (&tv_start, NULL);
+		atexit (metrics_exit);
+		if (getenv ("FIPS_VERBOSE"))
+			verbose = 1;
+		initialized = 1;
+	}
+
+	frames++;
+
+	metrics_collect_available ();
 
 	if (frames % 15 == 0) {
 		double fps;
 
+		gettimeofday (&tv_now, NULL);
+
 		fps = (double) frames / (tv_now.tv_sec - tv_start.tv_sec +
 					 (tv_now.tv_usec - tv_start.tv_usec) / 1.0e6);