Skip to content

Commit f3ae653

Browse files
committed
rping: terminate CM event thread before exiting
The CM event thread processes events in a loop with no explicit termination. When the last CM event is received, the main thread proceeds to clean up and destroy the CM event channel. If this occurs after the CM event thread has processed the last event, but before it reaches rdma_get_cm_event again, then the subsequent call to rdma_get_cm_event will fail and cause the process to exit with a failure code even though the test was actually successful. This causes flakiness in test scripts that use rping for basic functional testing.

Fix this by using an eventfd+poll to explicitly signal the CM event thread for termination.

Tested by running 4096 parallel rping processes.

Fixes: 6f640ff ("r7019: Introduce event channels.")
Signed-off-by: Jacob Moroni <[email protected]>
1 parent 5abc21f commit f3ae653

File tree

1 file changed

+41
-7
lines changed

1 file changed

+41
-7
lines changed

librdmacm/examples/rping.c

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@
3636
#include <stdlib.h>
3737
#include <stdio.h>
3838
#include <errno.h>
39+
#include <unistd.h>
40+
#include <sys/poll.h>
41+
#include <sys/eventfd.h>
3942
#include <sys/types.h>
4043
#include <sys/socket.h>
4144
#include <netdb.h>
@@ -151,6 +154,7 @@ struct rping_cb {
151154
int validate; /* validate ping data */
152155

153156
/* CM stuff */
157+
int eventfd;
154158
pthread_t cmthread;
155159
struct rdma_event_channel *cm_channel;
156160
struct rdma_cm_id *cm_id; /* connection on client side,*/
@@ -666,18 +670,36 @@ static void *cm_thread(void *arg)
666670
{
667671
struct rping_cb *cb = arg;
668672
struct rdma_cm_event *event;
673+
struct pollfd pfds[2];
669674
int ret;
670675

676+
pfds[0].fd = cb->eventfd;
677+
pfds[0].events = POLLIN;
678+
pfds[1].fd = cb->cm_channel->fd;
679+
pfds[1].events = POLLIN;
680+
671681
while (1) {
672-
ret = rdma_get_cm_event(cb->cm_channel, &event);
673-
if (ret) {
674-
perror("rdma_get_cm_event");
682+
ret = poll(pfds, 2, -1);
683+
if (ret == -1 && errno != EINTR) {
684+
perror("poll failed");
675685
exit(ret);
686+
} else if (ret < 1)
687+
continue;
688+
689+
if (pfds[0].revents & POLLIN)
690+
return NULL;
691+
692+
if (pfds[1].revents & POLLIN) {
693+
ret = rdma_get_cm_event(cb->cm_channel, &event);
694+
if (ret) {
695+
perror("rdma_get_cm_event");
696+
exit(ret);
697+
}
698+
ret = rping_cma_event_handler(event->id, event);
699+
rdma_ack_cm_event(event);
700+
if (ret)
701+
exit(ret);
676702
}
677-
ret = rping_cma_event_handler(event->id, event);
678-
rdma_ack_cm_event(event);
679-
if (ret)
680-
exit(ret);
681703
}
682704
}
683705

@@ -1279,6 +1301,7 @@ int main(int argc, char *argv[])
12791301
int op;
12801302
int ret = 0;
12811303
int persistent_server = 0;
1304+
const uint64_t efdw = 1;
12821305

12831306
cb = malloc(sizeof(*cb));
12841307
if (!cb)
@@ -1366,6 +1389,13 @@ int main(int argc, char *argv[])
13661389
goto out;
13671390
}
13681391

1392+
cb->eventfd = eventfd(0, EFD_NONBLOCK);
1393+
if (cb->eventfd == -1) {
1394+
perror("Could not create event FD");
1395+
ret = errno;
1396+
goto out;
1397+
}
1398+
13691399
cb->cm_channel = create_event_channel();
13701400
if (!cb->cm_channel) {
13711401
ret = errno;
@@ -1397,6 +1427,10 @@ int main(int argc, char *argv[])
13971427
DEBUG_LOG("destroy cm_id %p\n", cb->cm_id);
13981428
rdma_destroy_id(cb->cm_id);
13991429
out2:
1430+
if (write(cb->eventfd, &efdw, sizeof(efdw)) != sizeof(efdw))
1431+
fprintf(stderr, "Failed to signal CM thread\n");
1432+
1433+
pthread_join(cb->cmthread, NULL);
14001434
rdma_destroy_event_channel(cb->cm_channel);
14011435
out:
14021436
free(cb);

0 commit comments

Comments
 (0)