@@ -5903,6 +5903,13 @@ static int llama_decode_internal(
 
     ggml_allocr_alloc_graph(lctx.alloc, gf);
 
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
+    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+
+
 #ifdef GGML_USE_CUBLAS
     for (int i = 0; i < gf->n_leafs; i++) {
         ggml_tensor * node = gf->leafs[i];
@@ -5920,6 +5927,12 @@ static int llama_decode_internal(
     }
 
     ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
+
+    // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
+    if (!lctx.embedding.empty()) {
+        embeddings->backend = GGML_BACKEND_CPU;
+    }
+    res->backend = GGML_BACKEND_CPU;
 #endif
 
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -5944,12 +5957,6 @@ static int llama_decode_internal(
         n_threads = 1;
     }
 
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-
 #if GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
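
For reference, the pattern this change relies on can be exercised in isolation: after the graph is built, the last two nodes are taken to be the output tensors, verified by name, and pinned to the CPU backend before compute so that their data can be read from host memory afterwards. The following is a minimal sketch against the ggml API of this era (where tensors still carry a backend field); the toy graph, tensor sizes and fill values are placeholder assumptions, and only the node lookup, the name asserts and the backend override mirror the patch.

// Standalone sketch (not part of the patch): build a tiny graph, look up the
// output tensors the same way llama_decode_internal does, pin them to the CPU
// backend and read the result back after compute. The graph itself and the
// tensor contents are placeholders.
#include <stdio.h>
#include <string.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // stand-ins for the real hidden state and output projection
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_set_f32(x, 1.0f);
    ggml_set_f32(w, 0.5f);

    struct ggml_tensor * cur = ggml_norm(ctx, x, 1e-5f);
    ggml_set_name(cur, "result_norm");
    struct ggml_tensor * out = ggml_mul_mat(ctx, w, cur);
    ggml_set_name(out, "result_output");

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    // same lookup as in the patch: the outputs are the last two graph nodes
    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
    GGML_ASSERT(strcmp(res->name,        "result_output") == 0);
    GGML_ASSERT(strcmp(embeddings->name, "result_norm")   == 0);

    // mirror the HACK from the patch: force the outputs onto the CPU backend
    // so ->data stays readable on the host (a no-op on a CPU-only build)
    embeddings->backend = GGML_BACKEND_CPU;
    res->backend        = GGML_BACKEND_CPU;

    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    printf("first output value: %f\n", ggml_get_data_f32(res)[0]);

    ggml_free(ctx);
    return 0;
}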