Skip to content

sway+chromium crashes when an execbuffer relocation references a I915_MADV_DONTNEED object #5128

Open
@thejh

Description

I'm still in the process of debugging this, but figured I'd file an issue here for now to track things, in case anyone else is concurrently trying to figure out the same issue:

When running a build of Chromium git master with native wayland support (built with use_ozone = true, invoked with --ozone-platform=wayland) on a system with integrated Intel graphics, sway occasionally dies with the error message i965: Failed to submit batchbuffer: Bad address, which comes from mesa's submit_batch(). The cause of this error message appears to be that the execbuffer's validation list contains entries pointing to objects in I915_MADV_DONTNEED state.

My suspicion so far is that the core problem is somewhere in Chromium, and mesa is making things worse by not being able to handle this condition gracefully; but I'm not sure yet.

To be able to reproduce this more reliably, you can hack a new API for testing the status of an object into the kernel:

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5f6e63952821..371ccb3aa6db 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1024,6 +1024,7 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 	switch (args->madv) {
 	case I915_MADV_DONTNEED:
 	case I915_MADV_WILLNEED:
+	case 0x42:
 	    break;
 	default:
 	    return -EINVAL;
@@ -1037,7 +1038,7 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 	if (err)
 		goto out;
 
-	if (i915_gem_object_has_pages(obj) &&
+	if (args->madv != 0x42 && i915_gem_object_has_pages(obj) &&
 	    i915_gem_object_is_tiled(obj) &&
 	    i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
 		if (obj->mm.madv == I915_MADV_WILLNEED) {
@@ -1052,10 +1053,12 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 		}
 	}
 
-	if (obj->mm.madv != __I915_MADV_PURGED)
+	if (obj->mm.madv != __I915_MADV_PURGED && args->madv != 0x42)
 		obj->mm.madv = args->madv;
+	if (args->madv == 0x42)
+		args->madv = obj->mm.madv;
 
-	if (i915_gem_object_has_pages(obj)) {
+	if (args->madv != 0x42 && i915_gem_object_has_pages(obj)) {
 		struct list_head *list;
 
 		if (i915_gem_object_is_shrinkable(obj)) {

and then teach mesa to use that when adding things to an execbuffer's validation list (I really hope this is actually correct and doesn't just trigger false warnings):

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index f1465ed3556..548d44458c2 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -58,6 +58,17 @@ intel_batchbuffer_reset(struct brw_context *brw);
 static void
 brw_new_batch(struct brw_context *brw);
 
+static void assert_needed(__DRIscreen *dri_screen, unsigned int handle)
+{
+   struct drm_i915_gem_madvise madv = {
+      .handle = handle,
+      .madv = 0x42,
+      .retained = -1,
+   };
+   assert(ioctl(dri_screen->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv) == 0);
+   assert(madv.madv == I915_MADV_WILLNEED);
+}
+
 static void
 dump_validation_list(struct intel_batchbuffer *batch)
 {
@@ -177,6 +188,9 @@ static unsigned
 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
 {
    assert(bo->bufmgr == batch->batch.bo->bufmgr);
+   struct brw_context *brw = NULL;
+   brw = container_of(batch, brw, batch);
+   __DRIscreen *dri_screen = brw->screen->driScrnPriv;
 
    unsigned index = READ_ONCE(bo->index);
 
@@ -201,6 +215,8 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
                  batch->exec_array_size * sizeof(batch->validation_list[0]));
    }
 
+   assert_needed(dri_screen, bo->gem_handle);
+
    batch->validation_list[batch->exec_count] =
       (struct drm_i915_gem_exec_object2) {
          .handle = bo->gem_handle,
@@ -783,6 +799,13 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
          assert(entry->handle == batch->state.bo->gem_handle);
          entry->relocation_count = batch->state_relocs.reloc_count;
          entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
+
+         if (!batch->use_batch_first) {
+            for (int i=0; i<batch->state_relocs.reloc_count; i++) {
+               struct drm_i915_gem_relocation_entry *entry = batch->state_relocs.relocs + i;
+               assert_needed(dri_screen, entry->target_handle);
+            }
+         }
       }
 
       /* Set batchbuffer relocations */
@@ -791,6 +814,13 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
       entry->relocation_count = batch->batch_relocs.reloc_count;
       entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;
 
+      if (!batch->use_batch_first) {
+         for (int i=0; i<batch->batch_relocs.reloc_count; i++) {
+            struct drm_i915_gem_relocation_entry *entry = batch->batch_relocs.relocs + i;
+            assert_needed(dri_screen, entry->target_handle);
+         }
+      }
+
       if (batch->use_batch_first) {
          flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
       } else {
@@ -808,6 +838,10 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
          batch->exec_bos[index] = tmp_bo;
       }
 
+      for (int i=0; i<batch->exec_count; i++) {
+         assert_needed(dri_screen, batch->validation_list[i].handle);
+      }
+
       ret = execbuffer(dri_screen->fd, batch, brw->hw_ctx,
                        4 * USED_BATCH(*batch),
                        in_fence_fd, out_fence_fd, flags);
@@ -926,6 +960,9 @@ emit_reloc(struct intel_batchbuffer *batch,
            struct brw_bo *target, int32_t target_offset,
            unsigned int reloc_flags)
 {
+   struct brw_context *brw = NULL;
+   brw = container_of(batch, brw, batch);
+   __DRIscreen *dri_screen = brw->screen->driScrnPriv;
    assert(target != NULL);
 
    if (target->kflags & EXEC_OBJECT_PINNED) {
@@ -961,6 +998,9 @@ emit_reloc(struct intel_batchbuffer *batch,
    if (reloc_flags)
       entry->flags |= reloc_flags & batch->valid_reloc_flags;
 
+   if (!batch->use_batch_first)
+      assert_needed(dri_screen, target->gem_handle);
+
    rlist->relocs[rlist->reloc_count++] =
       (struct drm_i915_gem_relocation_entry) {
          .offset = offset,

The assert triggers as soon as I launch Chromium.

Based on some debugging I've done with strace, the handles referenced by these validation list entries seem to be for relocations, and seem to be allocated via DRM_IOCTL_PRIME_FD_TO_HANDLE, from here:

77661 ioctl(9, DRM_IOCTL_PRIME_FD_TO_HANDLE <unfinished ...>
[...]
77661 <... ioctl resumed>, {handle=0x19, flags=0, fd=55}) = 0
 > /lib/x86_64-linux-gnu/libc-2.30.so(ioctl+0x7) [0xf43e7]
 > /usr/lib/x86_64-linux-gnu/libdrm.so.2.4.0(drmIoctl+0x27) [0x5cc7]
 > /usr/lib/x86_64-linux-gnu/libdrm.so.2.4.0(drmPrimeFDToHandle+0x33) [0x8e63]
 > /usr/local/lib/x86_64-linux-gnu/dri/i965_dri.so(brw_bo_gem_create_from_prime_internal+0x3a) [0x131d49]
 > /usr/local/lib/x86_64-linux-gnu/dri/i965_dri.so(brw_bo_gem_create_from_prime_tiled+0x5a) [0x132039]
 > /usr/local/lib/x86_64-linux-gnu/dri/i965_dri.so(intel_create_image_from_fds_common+0x1ad) [0x17faf1]
 > /usr/local/lib/x86_64-linux-gnu/dri/i965_dri.so(intel_create_image_from_dma_bufs2+0x6f) [0x1801a5]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(dri2_create_image_dma_buf+0x225) [0x1fb99]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(dri2_create_image_khr+0xf9) [0x2037f]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(dri2_drm_create_image_khr+0x66) [0x27610]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(dri2_create_image+0x5c) [0x1e4a6]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(_eglCreateImageCommon+0x13f) [0xf4ba]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(eglCreateImageKHR+0x83) [0xf5a1]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(wlr_egl_create_image_from_dmabuf+0x48e) [0x29340]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(wlr_gles2_texture_from_dmabuf+0x1a2) [0x2c1d3]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(gles2_texture_from_dmabuf+0x36) [0x2ab47]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(wlr_texture_from_dmabuf+0x47) [0x2cf48]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(wlr_client_buffer_create+0x16e) [0x60fa4]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(surface_apply_damage+0xb8) [0x7e45d]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(surface_commit_pending+0x11d) [0x7e6ed]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(surface_commit+0x67) [0x7ea52]
 > /usr/lib/x86_64-linux-gnu/libffi.so.7.1.0(ffi_call_unix64+0x54) [0x6ccc]
 > /usr/lib/x86_64-linux-gnu/libffi.so.7.1.0(ffi_call_int+0x1a9) [0x6259]
 > /usr/lib/x86_64-linux-gnu/libwayland-server.so.0.1.0(wl_closure_invoke+0x15f) [0xd37f]
 > /usr/lib/x86_64-linux-gnu/libwayland-server.so.0.1.0(wl_client_connection_data+0x231) [0x97f1]
 > /usr/lib/x86_64-linux-gnu/libwayland-server.so.0.1.0(wl_event_loop_dispatch+0xc1) [0xb401]
 > /usr/lib/x86_64-linux-gnu/libwayland-server.so.0.1.0(wl_display_run+0x24) [0x99e4]
 > /usr/local/bin/sway(server_run+0x47) [0x1ef4f]
 > /usr/local/bin/sway(main+0x583) [0x1e532]
 > /lib/x86_64-linux-gnu/libc-2.30.so(__libc_start_main+0xea) [0x26e0a]
 > /usr/local/bin/sway(_start+0x29) [0x102b9]

and in at least one case, the handle actually initially was I915_MADV_WILLNEED, but then became I915_MADV_DONTNEED at a later point without any relevant-looking syscalls from sway.

The file descriptor from which the handle was created was received from Chromium over a unix domain socket; in Chromium, the file descriptor comes from this spot:

77702 fcntl(714, F_DUPFD_CLOEXEC, 0 <unfinished ...>
[...]
77702 <... fcntl resumed>)              = 720
 > /lib/x86_64-linux-gnu/libc-2.30.so(__fcntl64_nocancel_adjusted+0x20) [0xf34a0]
 > /lib/x86_64-linux-gnu/libc-2.30.so(__libc_fcntl64+0x43) [0xeea43]
 > /lib/x86_64-linux-gnu/libpthread-2.30.so(fcntl@GLIBC_2.2.5+0x3a) [0x12d9a]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(wl_os_dupfd_cloexec+0x1d) [0xdb95d]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(wl_closure_marshal+0x10a) [0xda0da]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(wl_proxy_marshal_array_constructor_versioned+0x16e) [0xd792e]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(wl_proxy_marshal+0x10f) [0xd7aef]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(ui::WaylandZwpLinuxDmabuf::CreateBuffer(base::ScopedGeneric<int, base::internal::ScopedFDCloseTraits>, gfx::Size const&, std::__Cr::vector<unsigned int, std::__Cr::allocator<unsigned int> > const&, std::__Cr::vector<unsigned int, std::__Cr::allocator<unsigned int> > const&, std::__Cr::vector<unsigned long, std::__Cr::allocator<unsigned long> > const&, unsigned int, unsigned int, base::OnceCallback<void (wl::Object<wl_buffer>)>)+0x92) [0xc6fa2]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(ui::WaylandBufferManagerHost::CreateDmabufBasedBuffer(mojo::PlatformHandle, gfx::Size const&, std::__Cr::vector<unsigned int, std::__Cr::allocator<unsigned int> > const&, std::__Cr::vector<unsigned int, std::__Cr::allocator<unsigned int> > const&, std::__Cr::vector<unsigned long, std::__Cr::allocator<unsigned long> > const&, unsigned int, unsigned int, unsigned int)+0x22b) [0xb0c9b]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(ui::ozone::mojom::WaylandBufferManagerHostStubDispatch::Accept(ui::ozone::mojom::WaylandBufferManagerHost*, mojo::Message*)+0x8d2) [0xce462]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::InterfaceEndpointClient::HandleValidatedMessage(mojo::Message*)+0x295) [0x25095]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::internal::MultiplexRouter::ProcessIncomingMessage(mojo::internal::MultiplexRouter::MessageWrapper*, mojo::internal::MultiplexRouter::ClientCallBehavior, base::SequencedTaskRunner*)+0x2f7) [0x2b8c7]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::internal::MultiplexRouter::Accept(mojo::Message*)+0x116) [0x2b0b6]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::Connector::DispatchMessage(mojo::Message)+0xf4) [0x202a4]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::Connector::DispatchNextMessageInQueue()+0x18e) [0x1ff6e]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::TaskAnnotator::RunTask(char const*, base::PendingTask*)+0x12a) [0x18656a]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::sequence_manager::internal::ThreadControllerWithMessagePumpImpl::DoWorkImpl(base::sequence_manager::LazyNow*, bool*)+0x17d) [0x1990fd]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::sequence_manager::internal::ThreadControllerWithMessagePumpImpl::DoSomeWork()+0x9f) [0x198e8f]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::MessagePumpGlib::Run(base::MessagePump::Delegate*)+0x89) [0x13f0d9]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::sequence_manager::internal::ThreadControllerWithMessagePumpImpl::Run(bool, base::TimeDelta)+0x74) [0x199904]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::RunLoop::Run()+0x18d) [0x169e1d]
 > /home/jann/chromium/src/out/mybuild_stable/chrome(ChromeBrowserMainParts::MainMessageLoopRun(int*)+0x72) [0x152a0e2]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::BrowserMainLoop::RunMainMessageLoopParts()+0x3a) [0xbb71ca]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::BrowserMainRunnerImpl::Run()+0x11) [0xbb8f51]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::BrowserMain(content::MainFunctionParams const&)+0xcc) [0xbb477c]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::ContentMainRunnerImpl::RunServiceManager(content::MainFunctionParams&, bool)+0x3c8) [0x13c9668]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::ContentMainRunnerImpl::Run(bool)+0x126) [0x13c9276]
 > /home/jann/chromium/src/out/mybuild_stable/libembedder.so(service_manager::Main(service_manager::MainParams const&)+0xd33) [0xf753]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::ContentMain(content::ContentMainParams const&)+0x80) [0x13c74d0]
 > /home/jann/chromium/src/out/mybuild_stable/chrome(ChromeMain+0xcd) [0xf11cfd]
 > /lib/x86_64-linux-gnu/libc-2.30.so(__libc_start_main+0xea) [0x26e0a]
 > /home/jann/chromium/src/out/mybuild_stable/chrome(_start+0x29) [0xf11b39]

Activity

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Assignees

No one assigned

    Labels

    bugNot working as intendedclient-compatCompatibility issues with a particular application

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions