Skip to content

Commit 62c7529

Browse files
committed
Handle non-trivially-copyable types in Loop/Scan output concatenation
ConcatenateCpuOutput byte-copied tensor data via gsl::copy over a span<const std::byte> cast, which is incorrect for std::string elements (not trivially copyable). Add an IsDataTypeString() branch that uses DataAsSpan and std::copy with proper copy-assignment semantics. Similarly, OutputIterator::ZeroOutCurrent used memset(0) which would corrupt std::string objects. Since the strings are already default- constructed (empty) from placement-new, simply return early. Add tests exercising string scan outputs (single and multi-element), string loop-carried variables, and zero-iteration edge case.
1 parent 4bfb60e commit 62c7529

3 files changed

Lines changed: 338 additions & 12 deletions

File tree

onnxruntime/core/providers/cpu/controlflow/loop.cc

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -273,13 +273,9 @@ static Status ConcatenateCpuOutput(void* /*stream*/,
273273
void* output, size_t output_size_in_bytes) {
274274
const auto& first_output = per_iteration_output.front().Get<Tensor>();
275275
const auto& per_iteration_shape = first_output.Shape();
276+
const bool is_string = first_output.IsDataTypeString();
276277
size_t bytes_per_iteration = first_output.SizeInBytes();
277278

278-
// we can't easily use a C++ template for the tensor element type,
279-
// so use a span for some protection but work in bytes
280-
gsl::span<std::byte> output_span = gsl::make_span<std::byte>(static_cast<std::byte*>(output),
281-
output_size_in_bytes);
282-
283279
for (size_t i = 0, num_iterations = per_iteration_output.size(); i < num_iterations; ++i) {
284280
auto& ort_value = per_iteration_output[i];
285281
auto& iteration_data = ort_value.Get<Tensor>();
@@ -290,10 +286,21 @@ static Status ConcatenateCpuOutput(void* /*stream*/,
290286
" Expected:", per_iteration_shape, " Got:", iteration_data.Shape());
291287
}
292288

293-
auto src = gsl::make_span<const std::byte>(static_cast<const std::byte*>(iteration_data.DataRaw()),
294-
bytes_per_iteration);
295-
auto dst = output_span.subspan(i * bytes_per_iteration, bytes_per_iteration);
296-
gsl::copy(src, dst);
289+
if (is_string) {
290+
// std::string is not trivially copyable — must use proper copy semantics
291+
auto src = iteration_data.DataAsSpan<std::string>();
292+
auto* dst_begin = static_cast<std::string*>(output) + i * first_output.Shape().Size();
293+
std::copy(src.begin(), src.end(), dst_begin);
294+
} else {
295+
// we can't easily use a C++ template for the tensor element type,
296+
// so use a span for some protection but work in bytes
297+
gsl::span<std::byte> output_span = gsl::make_span<std::byte>(static_cast<std::byte*>(output),
298+
output_size_in_bytes);
299+
auto src = gsl::make_span<const std::byte>(static_cast<const std::byte*>(iteration_data.DataRaw()),
300+
bytes_per_iteration);
301+
auto dst = output_span.subspan(i * bytes_per_iteration, bytes_per_iteration);
302+
gsl::copy(src, dst);
303+
}
297304
}
298305

299306
return Status::OK();

onnxruntime/core/providers/cpu/controlflow/scan_utils.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,13 @@ class OutputIterator {
106106

107107
// set the output for the current iteration to zeros. used for short sequence lengths
108108
Status ZeroOutCurrent() {
109-
auto status = Status::OK();
110109
auto* tensor = (**this).GetMutable<Tensor>();
111-
status = zero_data_func_(tensor->MutableDataRaw(), tensor->SizeInBytes());
112-
return status;
110+
if (tensor->IsDataTypeString()) {
111+
// std::string is not trivially copyable — memset would corrupt the objects.
112+
// The strings are already default-constructed (empty) from placement-new so nothing to do.
113+
return Status::OK();
114+
}
115+
return zero_data_func_(tensor->MutableDataRaw(), tensor->SizeInBytes());
113116
}
114117

115118
const OrtValue& GetOutput() const {

onnxruntime/test/providers/cpu/controlflow/loop_test.cc

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,6 +1041,322 @@ TEST(Loop, IterationCountAsOutput) {
10411041
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
10421042
}
10431043

1044+
// Verify that Loop correctly handles tensor(string) scan outputs.
1045+
// Strings are not trivially copyable so the concatenation path must use proper copy semantics.
1046+
// Uses strings exceeding the small-string-optimization threshold to exercise heap-allocated payloads.
1047+
TEST(Loop, StringScanOutput) {
1048+
auto create_subgraph = []() {
1049+
Model model("String scan output subgraph", false, DefaultLoggingManager().DefaultLogger());
1050+
auto& graph = model.MainGraph();
1051+
1052+
std::vector<NodeArg*> inputs;
1053+
std::vector<NodeArg*> outputs;
1054+
1055+
/* Subgraph produces a constant string tensor as a scan output each iteration.
1056+
1057+
iter_num_in cond_in
1058+
(unused) |
1059+
[Identity]
1060+
|
1061+
cond_out
1062+
1063+
[Constant] -> scan_output (string tensor, shape {1})
1064+
*/
1065+
1066+
// graph input types
1067+
TypeProto int64_scalar;
1068+
int64_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64);
1069+
int64_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1070+
1071+
TypeProto bool_scalar;
1072+
bool_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL);
1073+
bool_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1074+
1075+
TypeProto string_tensor;
1076+
string_tensor.mutable_tensor_type()->set_elem_type(TensorProto_DataType_STRING);
1077+
string_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1078+
1079+
// graph inputs
1080+
auto& iter_num_in = graph.GetOrCreateNodeArg("iter_num_in", &int64_scalar);
1081+
auto& cond_in = graph.GetOrCreateNodeArg("cond_in", &bool_scalar);
1082+
1083+
// graph outputs
1084+
auto& cond_out = graph.GetOrCreateNodeArg("cond_out", &bool_scalar);
1085+
auto& scan_out = graph.GetOrCreateNodeArg("scan_out", &string_tensor);
1086+
1087+
// cond_in -> cond_out
1088+
{
1089+
inputs = {&cond_in};
1090+
outputs = {&cond_out};
1091+
graph.AddNode("cond_identity", "Identity", "Forward cond", inputs, outputs);
1092+
}
1093+
1094+
// Constant -> scan_out (string long enough to exceed SSO)
1095+
{
1096+
TensorProto value_tensor;
1097+
value_tensor.set_name("string_const");
1098+
value_tensor.add_dims(1);
1099+
value_tensor.set_data_type(TensorProto_DataType_STRING);
1100+
// Use a string longer than typical SSO buffer (>22 chars) to ensure heap allocation
1101+
value_tensor.add_string_data("this_string_exceeds_sso_threshold_and_uses_heap_allocation");
1102+
1103+
auto& constant_node = graph.AddNode("string_constant", "Constant", "String constant",
1104+
{}, {&scan_out});
1105+
constant_node.AddAttribute("value", value_tensor);
1106+
}
1107+
1108+
graph.SetInputs({&iter_num_in, &cond_in});
1109+
graph.SetOutputs({&cond_out, &scan_out});
1110+
1111+
auto status = graph.Resolve();
1112+
EXPECT_EQ(status, Status::OK());
1113+
1114+
return graph.ToGraphProto();
1115+
};
1116+
1117+
OpTester test("Loop", 11);
1118+
auto body = create_subgraph();
1119+
test.AddAttribute<GraphProto>("body", body);
1120+
test.AddInput<int64_t>("M", {1}, {3});
1121+
test.AddInput<bool>("cond", {1}, {true});
1122+
1123+
// scan output: 3 iterations, each producing a {1} string tensor -> final shape {3, 1}
1124+
test.AddOutput<std::string>("scan_out_final", {3, 1},
1125+
{"this_string_exceeds_sso_threshold_and_uses_heap_allocation",
1126+
"this_string_exceeds_sso_threshold_and_uses_heap_allocation",
1127+
"this_string_exceeds_sso_threshold_and_uses_heap_allocation"});
1128+
1129+
// Only CPU EP supports string tensors.
1130+
std::vector<std::unique_ptr<IExecutionProvider>> eps;
1131+
eps.push_back(DefaultCpuExecutionProvider());
1132+
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps);
1133+
}
1134+
1135+
// Verify multi-element string scan output (shape {2} per iteration).
1136+
TEST(Loop, StringScanOutputMultiElement) {
1137+
auto create_subgraph = []() {
1138+
Model model("Multi-element string scan output", false, DefaultLoggingManager().DefaultLogger());
1139+
auto& graph = model.MainGraph();
1140+
1141+
std::vector<NodeArg*> inputs;
1142+
std::vector<NodeArg*> outputs;
1143+
1144+
// graph input types
1145+
TypeProto int64_scalar;
1146+
int64_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64);
1147+
int64_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1148+
1149+
TypeProto bool_scalar;
1150+
bool_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL);
1151+
bool_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1152+
1153+
TypeProto string_tensor;
1154+
string_tensor.mutable_tensor_type()->set_elem_type(TensorProto_DataType_STRING);
1155+
string_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(2);
1156+
1157+
// graph inputs
1158+
auto& iter_num_in = graph.GetOrCreateNodeArg("iter_num_in", &int64_scalar);
1159+
auto& cond_in = graph.GetOrCreateNodeArg("cond_in", &bool_scalar);
1160+
1161+
// graph outputs
1162+
auto& cond_out = graph.GetOrCreateNodeArg("cond_out", &bool_scalar);
1163+
auto& scan_out = graph.GetOrCreateNodeArg("scan_out", &string_tensor);
1164+
1165+
// cond_in -> cond_out
1166+
{
1167+
inputs = {&cond_in};
1168+
outputs = {&cond_out};
1169+
graph.AddNode("cond_identity", "Identity", "Forward cond", inputs, outputs);
1170+
}
1171+
1172+
// Constant -> scan_out with 2 elements
1173+
{
1174+
TensorProto value_tensor;
1175+
value_tensor.set_name("string_const");
1176+
value_tensor.add_dims(2);
1177+
value_tensor.set_data_type(TensorProto_DataType_STRING);
1178+
value_tensor.add_string_data("first_heap_allocated_string_that_exceeds_sso_buffer_size");
1179+
value_tensor.add_string_data("second_heap_allocated_string_that_exceeds_sso_buffer_size");
1180+
1181+
auto& constant_node = graph.AddNode("string_constant", "Constant", "String constant",
1182+
{}, {&scan_out});
1183+
constant_node.AddAttribute("value", value_tensor);
1184+
}
1185+
1186+
graph.SetInputs({&iter_num_in, &cond_in});
1187+
graph.SetOutputs({&cond_out, &scan_out});
1188+
1189+
auto status = graph.Resolve();
1190+
EXPECT_EQ(status, Status::OK());
1191+
1192+
return graph.ToGraphProto();
1193+
};
1194+
1195+
OpTester test("Loop", 11);
1196+
auto body = create_subgraph();
1197+
test.AddAttribute<GraphProto>("body", body);
1198+
test.AddInput<int64_t>("M", {1}, {2});
1199+
test.AddInput<bool>("cond", {1}, {true});
1200+
1201+
// scan output: 2 iterations x {2} elements -> {2, 2}
1202+
test.AddOutput<std::string>("scan_out_final", {2, 2},
1203+
{"first_heap_allocated_string_that_exceeds_sso_buffer_size",
1204+
"second_heap_allocated_string_that_exceeds_sso_buffer_size",
1205+
"first_heap_allocated_string_that_exceeds_sso_buffer_size",
1206+
"second_heap_allocated_string_that_exceeds_sso_buffer_size"});
1207+
1208+
// Only CPU EP supports string tensors.
1209+
std::vector<std::unique_ptr<IExecutionProvider>> eps;
1210+
eps.push_back(DefaultCpuExecutionProvider());
1211+
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps);
1212+
}
1213+
1214+
// Verify Loop with a string loop-carried variable (uses IDataTransfer::CopyTensor path).
1215+
TEST(Loop, StringLoopCarriedVar) {
1216+
auto create_subgraph = []() {
1217+
Model model("String loop-carried var subgraph", false, DefaultLoggingManager().DefaultLogger());
1218+
auto& graph = model.MainGraph();
1219+
1220+
std::vector<NodeArg*> inputs;
1221+
std::vector<NodeArg*> outputs;
1222+
1223+
/* Subgraph passes through a string loop-carried variable unchanged.
1224+
1225+
iter_num_in cond_in loop_var_in (string)
1226+
(unused) | |
1227+
[Identity] [Identity]
1228+
| |
1229+
cond_out loop_var_out
1230+
*/
1231+
1232+
TypeProto int64_scalar;
1233+
int64_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64);
1234+
int64_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1235+
1236+
TypeProto bool_scalar;
1237+
bool_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL);
1238+
bool_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1239+
1240+
TypeProto string_tensor;
1241+
string_tensor.mutable_tensor_type()->set_elem_type(TensorProto_DataType_STRING);
1242+
string_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1243+
1244+
auto& iter_num_in = graph.GetOrCreateNodeArg("iter_num_in", &int64_scalar);
1245+
auto& cond_in = graph.GetOrCreateNodeArg("cond_in", &bool_scalar);
1246+
auto& loop_var_in = graph.GetOrCreateNodeArg("loop_var_in", &string_tensor);
1247+
1248+
auto& cond_out = graph.GetOrCreateNodeArg("cond_out", &bool_scalar);
1249+
auto& loop_var_out = graph.GetOrCreateNodeArg("loop_var_out", &string_tensor);
1250+
1251+
// cond_in -> cond_out
1252+
{
1253+
inputs = {&cond_in};
1254+
outputs = {&cond_out};
1255+
graph.AddNode("cond_identity", "Identity", "Forward cond", inputs, outputs);
1256+
}
1257+
1258+
// loop_var_in -> loop_var_out
1259+
{
1260+
inputs = {&loop_var_in};
1261+
outputs = {&loop_var_out};
1262+
graph.AddNode("var_identity", "Identity", "Forward loop var", inputs, outputs);
1263+
}
1264+
1265+
graph.SetInputs({&iter_num_in, &cond_in, &loop_var_in});
1266+
graph.SetOutputs({&cond_out, &loop_var_out});
1267+
1268+
auto status = graph.Resolve();
1269+
EXPECT_EQ(status, Status::OK());
1270+
1271+
return graph.ToGraphProto();
1272+
};
1273+
1274+
OpTester test("Loop", 11);
1275+
auto body = create_subgraph();
1276+
test.AddAttribute<GraphProto>("body", body);
1277+
test.AddInput<int64_t>("M", {1}, {3});
1278+
test.AddInput<bool>("cond", {1}, {true});
1279+
test.AddInput<std::string>("loop_var_init", {1},
1280+
{"a_long_string_value_that_definitely_exceeds_the_sso_threshold"});
1281+
1282+
test.AddOutput<std::string>("loop_var_final", {1},
1283+
{"a_long_string_value_that_definitely_exceeds_the_sso_threshold"});
1284+
1285+
// Only CPU EP supports string tensors.
1286+
std::vector<std::unique_ptr<IExecutionProvider>> eps;
1287+
eps.push_back(DefaultCpuExecutionProvider());
1288+
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps);
1289+
}
1290+
1291+
// Verify Loop with zero trip count produces empty scan output for strings.
1292+
TEST(Loop, StringScanOutputZeroIterations) {
1293+
auto create_subgraph = []() {
1294+
Model model("String scan output zero iter", false, DefaultLoggingManager().DefaultLogger());
1295+
auto& graph = model.MainGraph();
1296+
1297+
std::vector<NodeArg*> inputs;
1298+
std::vector<NodeArg*> outputs;
1299+
1300+
TypeProto int64_scalar;
1301+
int64_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64);
1302+
int64_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1303+
1304+
TypeProto bool_scalar;
1305+
bool_scalar.mutable_tensor_type()->set_elem_type(TensorProto_DataType_BOOL);
1306+
bool_scalar.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1307+
1308+
TypeProto string_tensor;
1309+
string_tensor.mutable_tensor_type()->set_elem_type(TensorProto_DataType_STRING);
1310+
string_tensor.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1);
1311+
1312+
auto& iter_num_in = graph.GetOrCreateNodeArg("iter_num_in", &int64_scalar);
1313+
auto& cond_in = graph.GetOrCreateNodeArg("cond_in", &bool_scalar);
1314+
1315+
auto& cond_out = graph.GetOrCreateNodeArg("cond_out", &bool_scalar);
1316+
auto& scan_out = graph.GetOrCreateNodeArg("scan_out", &string_tensor);
1317+
1318+
{
1319+
inputs = {&cond_in};
1320+
outputs = {&cond_out};
1321+
graph.AddNode("cond_identity", "Identity", "Forward cond", inputs, outputs);
1322+
}
1323+
1324+
{
1325+
TensorProto value_tensor;
1326+
value_tensor.set_name("string_const");
1327+
value_tensor.add_dims(1);
1328+
value_tensor.set_data_type(TensorProto_DataType_STRING);
1329+
value_tensor.add_string_data("never_produced_because_zero_iterations");
1330+
1331+
auto& constant_node = graph.AddNode("string_constant", "Constant", "String constant",
1332+
{}, {&scan_out});
1333+
constant_node.AddAttribute("value", value_tensor);
1334+
}
1335+
1336+
graph.SetInputs({&iter_num_in, &cond_in});
1337+
graph.SetOutputs({&cond_out, &scan_out});
1338+
1339+
auto status = graph.Resolve();
1340+
EXPECT_EQ(status, Status::OK());
1341+
1342+
return graph.ToGraphProto();
1343+
};
1344+
1345+
OpTester test("Loop", 11);
1346+
auto body = create_subgraph();
1347+
test.AddAttribute<GraphProto>("body", body);
1348+
test.AddInput<int64_t>("M", {1}, {0});
1349+
test.AddInput<bool>("cond", {1}, {true});
1350+
1351+
// Zero iterations -> scan output shape {0, 1} with no elements
1352+
test.AddOutput<std::string>("scan_out_final", {0, 1}, {});
1353+
1354+
// Only CPU EP supports string tensors.
1355+
std::vector<std::unique_ptr<IExecutionProvider>> eps;
1356+
eps.push_back(DefaultCpuExecutionProvider());
1357+
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps);
1358+
}
1359+
10441360
#if defined(USE_CUDA)
10451361
// test that when part of the subgraph run on CUDA it executes successfully
10461362
TEST(Loop, MixedExecutionProviders) {

0 commit comments

Comments
 (0)