|
13 | 13 | See the License for the specific language governing permissions and |
14 | 14 | limitations under the License. |
15 | 15 | */ |
| 16 | +#include <dlfcn.h> |
| 17 | +#include <libgen.h> |
16 | 18 | #include <sys/stat.h> |
17 | 19 |
|
18 | 20 | #include "common/api_wrapper/api_wrapper.hpp" |
| 21 | +#include "common/log/log.hpp" |
19 | 22 | #include "common/api_wrapper/ofi_api_wrapper.hpp" |
20 | 23 |
|
21 | 24 | namespace ccl { |
@@ -64,20 +67,105 @@ std::string get_ofi_lib_path() { |
64 | 67 | return ofi_lib_path; |
65 | 68 | } |
66 | 69 |
|
67 | | -bool ofi_api_init() { |
68 | | - bool ret = true; |
| 70 | +static std::string get_relative_ccl_root_path() { |
| 71 | + Dl_info info; |
| 72 | + |
| 73 | + if (dladdr((void*)ccl::get_library_version, &info)) { |
| 74 | + char libccl_path[PATH_MAX]; |
| 75 | + |
| 76 | + if (realpath(info.dli_fname, libccl_path) != nullptr) { |
| 77 | + // We have to use `realpath`, so the dirname will work correctly, |
| 78 | + // because if there's any symlink like `..` in the path it will not work |
| 79 | + libccl_path[PATH_MAX - 1] = '\0'; |
| 80 | + |
| 81 | + // Remove `libccl.so` from the path to get directory like `$CCL_ROOT/lib` |
| 82 | + char* libccl_dir = dirname(libccl_path); |
| 83 | + // Remove the `lib` from the path the get just the `CCL_ROOT` |
| 84 | + char* ccl_root_cstr = dirname(libccl_dir); |
| 85 | + |
| 86 | + auto ccl_root = std::string(ccl_root_cstr); |
69 | 87 |
|
| 88 | + return ccl_root; |
| 89 | + } |
| 90 | + } |
| 91 | + |
| 92 | + return {}; |
| 93 | +} |
| 94 | + |
| 95 | +static bool load_libfabric() { |
70 | 96 | ofi_lib_info.ops = &ofi_lib_ops; |
71 | 97 | ofi_lib_info.fn_names = ofi_fn_names; |
72 | 98 | ofi_lib_info.path = get_ofi_lib_path(); |
73 | 99 |
|
74 | 100 | int error = load_library(ofi_lib_info); |
75 | | - if (error != CCL_LOAD_LB_SUCCESS) { |
76 | | - print_error(error, ofi_lib_info); |
77 | | - ret = false; |
| 101 | + if (error == CCL_LOAD_LB_SUCCESS) { |
| 102 | + return true; |
| 103 | + } |
| 104 | + |
| 105 | + print_error(error, ofi_lib_info); |
| 106 | + LOG_INFO("Retrying to load libfabric.so using relative path"); |
| 107 | + |
| 108 | + auto realtive_root = get_relative_ccl_root_path(); |
| 109 | + if (realtive_root.empty()) { |
| 110 | + return false; |
| 111 | + } |
| 112 | + |
| 113 | + // Path up to IMPI 2021.14 |
| 114 | + ofi_lib_info.path = realtive_root + "/lib/libfabric/libfabric.so"; |
| 115 | + error = load_library(ofi_lib_info); |
| 116 | + if (error == CCL_LOAD_LB_SUCCESS) { |
| 117 | + return true; |
| 118 | + } |
| 119 | + |
| 120 | + // Path in IMPI 2021.15 |
| 121 | + ofi_lib_info.path = realtive_root + "/lib/libfabric.so"; |
| 122 | + error = load_library(ofi_lib_info); |
| 123 | + if (error == CCL_LOAD_LB_SUCCESS) { |
| 124 | + return true; |
| 125 | + } |
| 126 | + |
| 127 | + print_error(error, ofi_lib_info); |
| 128 | + return false; |
| 129 | +} |
| 130 | + |
| 131 | +static void setup_providers() { |
| 132 | + const char* fi_provider_path = getenv("FI_PROVIDER_PATH"); |
| 133 | + if (fi_provider_path != nullptr) { |
| 134 | + LOG_DEBUG("FI_PROVIDER_PATH is already set to: ", fi_provider_path); |
| 135 | + return; |
| 136 | + } |
| 137 | + |
| 138 | + char libfabric_path[PATH_MAX]; |
| 139 | + dlinfo(ofi_lib_info.handle, RTLD_DI_ORIGIN, &libfabric_path); |
| 140 | + |
| 141 | + // Add realpath to resolve any symlinks and get the absolute path |
| 142 | + char real_libfabric_path[PATH_MAX]; |
| 143 | + if (!realpath(libfabric_path, real_libfabric_path)) { |
| 144 | + LOG_ERROR("Failed to resolve libfabric realpath: ", strerror(errno)); |
| 145 | + return; |
| 146 | + } |
| 147 | + |
| 148 | + std::string primary_path = std::string(real_libfabric_path); |
| 149 | + std::string secondary_path = primary_path + "/prov"; |
| 150 | + |
| 151 | + // Construct the full provider path with colon separator |
| 152 | + std::string full_provider_path = primary_path + ":" + secondary_path; |
| 153 | + |
| 154 | + if (setenv("FI_PROVIDER_PATH", full_provider_path.c_str(), 1) != 0) { |
| 155 | + LOG_ERROR("Failed to set FI_PROVIDER_PATH with error: ", strerror(errno)); |
| 156 | + return; |
| 157 | + } |
| 158 | + |
| 159 | + LOG_DEBUG("FI_PROVIDER_PATH set to: ", full_provider_path); |
| 160 | +} |
| 161 | + |
| 162 | +bool ofi_api_init() { |
| 163 | + if (load_libfabric() == false) { |
| 164 | + return false; |
78 | 165 | } |
79 | 166 |
|
80 | | - return ret; |
| 167 | + setup_providers(); |
| 168 | + return true; |
81 | 169 | } |
82 | 170 |
|
83 | 171 | void ofi_api_fini() { |
|
0 commit comments