From 800a2982407d1bddb9b35db154b5c151ac74645e Mon Sep 17 00:00:00 2001 From: Rommel Anatoli Quintanilla Cruz Date: Thu, 8 Oct 2020 14:20:59 -0500 Subject: [PATCH 1/6] Fix for empty folders --- engine/src/io/data_provider/UriDataProvider.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/engine/src/io/data_provider/UriDataProvider.cpp b/engine/src/io/data_provider/UriDataProvider.cpp index 68eb15e85..f21f8cd0e 100644 --- a/engine/src/io/data_provider/UriDataProvider.cpp +++ b/engine/src/io/data_provider/UriDataProvider.cpp @@ -160,8 +160,22 @@ data_handle uri_data_provider::get_next(bool open_file) { } } this->directory_uris = new_uris; - this->directory_current_file = 0; + + // If this->directory_uris is empty, + // the folder is empty, we just skip it + if(this->directory_uris.size()==0){ + this->current_file++; + + auto logger = spdlog::get("batch_logger"); + if(logger != nullptr) { + logger->warn("|||{info}|||||", "info"_a="Folder is empty"); + } + + data_handle empty_handle; + return empty_handle; + } + return get_next(open_file); } else if(fileStatus.isFile()) { From a674c0013eae9b8d01ac5555af7bd433a1794757 Mon Sep 17 00:00:00 2001 From: Rommel Anatoli Quintanilla Cruz Date: Thu, 8 Oct 2020 14:24:02 -0500 Subject: [PATCH 2/6] Edit Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d72435d57..cb42d446d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ - #1053 Fixed issue when loading paths with wildcards - #1057 Fixed issue with concat all in concatenating cache - #1007 Fix arrow and spdlog compilation issues +- #1071 Fix crash when loading an empty folder From d39ff09657757f266c2ea984ab5565757511224e Mon Sep 17 00:00:00 2001 From: Rommel Anatoli Quintanilla Cruz Date: Fri, 9 Oct 2020 00:22:18 -0500 Subject: [PATCH 3/6] Unifying ignoring files routines --- engine/src/io/data_provider/DataProvider.h | 6 ------ engine/src/io/data_provider/UriDataProvider.cpp | 17 +++++++++++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/engine/src/io/data_provider/DataProvider.h b/engine/src/io/data_provider/DataProvider.h index 992a29ff4..f94591e33 100644 --- a/engine/src/io/data_provider/DataProvider.h +++ b/engine/src/io/data_provider/DataProvider.h @@ -36,12 +36,6 @@ struct data_handle { Uri uri; // in case the data was loaded from a file bool is_valid(){ - // sometimes parquet directories have a `_metadata` file that have not the same schema as the *.parquet files - // we don't want the data provider handle this one. - std::string file_name = uri.toString(true); - std::string metadata_str = file_name.substr(file_name.size() - 9); - if (metadata_str == "_metadata") return false; - return fileHandle != nullptr || !uri.isEmpty() ; } }; diff --git a/engine/src/io/data_provider/UriDataProvider.cpp b/engine/src/io/data_provider/UriDataProvider.cpp index f21f8cd0e..e602fa875 100644 --- a/engine/src/io/data_provider/UriDataProvider.cpp +++ b/engine/src/io/data_provider/UriDataProvider.cpp @@ -150,15 +150,28 @@ data_handle uri_data_provider::get_next(bool open_file) { this->directory_uris = BlazingContext::getInstance()->getFileSystemManager()->list(target_uri); } - std::string ender = ".crc"; + // sometimes parquet directories have somes files that + // have not the same schema as the *.parquet files + // we don't want the data provider handle this ones + std::vector ignored_suffixes{".crc", "_metadata", "_SUCCESS"}; + std::vector new_uris; for(int i = 0; i < this->directory_uris.size(); i++) { std::string fileName = this->directory_uris[i].getPath().toString(); - if(!StringUtil::endsWith(fileName, ender)) { + bool is_valid=true; + for(std::string ender : ignored_suffixes) { + if(StringUtil::endsWith(fileName, ender)) { + is_valid=false; + break; + } + } + + if(is_valid){ new_uris.push_back(this->directory_uris[i]); } } + this->directory_uris = new_uris; this->directory_current_file = 0; From 32d78aecfd6fd5b3a5514b974d9d7e8a07f6f54f Mon Sep 17 00:00:00 2001 From: Rommel Anatoli Quintanilla Cruz Date: Fri, 9 Oct 2020 04:00:17 -0500 Subject: [PATCH 4/6] Adding some unit tests for UriDataProvider --- engine/tests/CMakeLists.txt | 1 + engine/tests/provider/CMakeLists.txt | 5 ++ engine/tests/provider/provider_test.cpp | 65 +++++++++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 engine/tests/provider/CMakeLists.txt create mode 100644 engine/tests/provider/provider_test.cpp diff --git a/engine/tests/CMakeLists.txt b/engine/tests/CMakeLists.txt index 17fa7e23d..850619341 100644 --- a/engine/tests/CMakeLists.txt +++ b/engine/tests/CMakeLists.txt @@ -86,5 +86,6 @@ add_subdirectory(skipdata) add_subdirectory(sort) add_subdirectory(waiting_queue) add_subdirectory(kernel_tests) +add_subdirectory(provider) message(STATUS "******** Tests are ready ********") diff --git a/engine/tests/provider/CMakeLists.txt b/engine/tests/provider/CMakeLists.txt new file mode 100644 index 000000000..b78f55cd0 --- /dev/null +++ b/engine/tests/provider/CMakeLists.txt @@ -0,0 +1,5 @@ +set(provider_sources + provider_test.cpp +) + +configure_test(provider_test "${provider_sources}") \ No newline at end of file diff --git a/engine/tests/provider/provider_test.cpp b/engine/tests/provider/provider_test.cpp new file mode 100644 index 000000000..c06cfb12c --- /dev/null +++ b/engine/tests/provider/provider_test.cpp @@ -0,0 +1,65 @@ +#include +#include "tests/utilities/BlazingUnitTest.h" +#include "io/data_provider/UriDataProvider.h" + +struct ProviderTest : public BlazingUnitTest {}; + +TEST_F(ProviderTest, non_existent_directory) { + + std::string filename = "/fake/"; + std::vector uris; + uris.push_back(Uri{filename}); + + auto provider = std::make_shared(uris); + + bool open_file = false; + if(provider->has_next()){ + try{ + ral::io::data_handle new_handle = provider->get_next(open_file); + FAIL(); + } + catch(std::runtime_error e){ + SUCCEED(); + } + catch(std::exception e){ + FAIL(); + } + catch(...){ + FAIL(); + } + } +} + +void create_dummy_file(std::string content, std::string filename){ + std::ofstream outfile(filename, std::ofstream::out); + outfile << content << std::endl; + outfile.close(); +} + +TEST_F(ProviderTest, ignoring_dummy_files) { + + std::vector test_files = {"/tmp/file.crc", "/tmp/file_SUCCESS", "/tmp/file_metadata", "/tmp/file.csv"}; + + create_dummy_file("some crc", test_files[0]); + create_dummy_file("some flag", test_files[1]); + create_dummy_file("some meta", test_files[2]); + create_dummy_file("a|b\n0|0", test_files[3]); + + std::vector uris; + uris.push_back(Uri{"/tmp/file*"}); + + auto provider = std::make_shared(uris); + + bool open_file = false; + + std::vector result; + + while(provider->has_next()){ + ral::io::data_handle new_handle = provider->get_next(open_file); + std::string file_name = new_handle.uri.toString(true); + result.push_back(file_name); + } + + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0], "/tmp/file.csv"); +} \ No newline at end of file From de9bc95d4ae0fb1655cee0337b0ff1bf6b27c43f Mon Sep 17 00:00:00 2001 From: Rommel Anatoli Quintanilla Cruz Date: Fri, 9 Oct 2020 12:12:46 -0500 Subject: [PATCH 5/6] Adding test for empty dirs --- engine/tests/provider/provider_test.cpp | 37 +++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/engine/tests/provider/provider_test.cpp b/engine/tests/provider/provider_test.cpp index c06cfb12c..6636090de 100644 --- a/engine/tests/provider/provider_test.cpp +++ b/engine/tests/provider/provider_test.cpp @@ -1,14 +1,15 @@ #include #include "tests/utilities/BlazingUnitTest.h" #include "io/data_provider/UriDataProvider.h" +#include "FileSystem/LocalFileSystem.h" +#include "Util/StringUtil.h" struct ProviderTest : public BlazingUnitTest {}; TEST_F(ProviderTest, non_existent_directory) { std::string filename = "/fake/"; - std::vector uris; - uris.push_back(Uri{filename}); + std::vector uris = {Uri{filename}}; auto provider = std::make_shared(uris); @@ -45,8 +46,7 @@ TEST_F(ProviderTest, ignoring_dummy_files) { create_dummy_file("some meta", test_files[2]); create_dummy_file("a|b\n0|0", test_files[3]); - std::vector uris; - uris.push_back(Uri{"/tmp/file*"}); + std::vector uris = {Uri{"/tmp/file*"}}; auto provider = std::make_shared(uris); @@ -62,4 +62,31 @@ TEST_F(ProviderTest, ignoring_dummy_files) { EXPECT_EQ(result.size(), 1); EXPECT_EQ(result[0], "/tmp/file.csv"); -} \ No newline at end of file +} + +TEST_F(ProviderTest, empty_dir) { + + std::unique_ptr localFileSystem(new LocalFileSystem(Path("/"))); + + const int length = 10; + std::string dirname = "/tmp/" + randomString(length); + + bool dir_create_ok = localFileSystem->makeDirectory(Uri{dirname}); + ASSERT_TRUE(dir_create_ok); + + std::vector uris = {Uri{dirname}}; + auto provider = std::make_shared(uris); + + bool open_file = false; + + std::vector result; + + if(provider->has_next()){ + ral::io::data_handle new_handle = provider->get_next(open_file); + // an empty folder must return an empty handle + EXPECT_EQ(new_handle.uri.isEmpty(), true); + } + + bool dir_remove_ok = localFileSystem->remove(Uri{dirname}); + ASSERT_TRUE(dir_remove_ok); +} From 68e320d1421b43ba8f533b4394f6c853298bf42f Mon Sep 17 00:00:00 2001 From: Rommel Anatoli Quintanilla Cruz Date: Fri, 9 Oct 2020 15:47:18 -0500 Subject: [PATCH 6/6] Ignoring ipython checkpointing files --- engine/src/io/data_provider/UriDataProvider.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/engine/src/io/data_provider/UriDataProvider.cpp b/engine/src/io/data_provider/UriDataProvider.cpp index e602fa875..5e5cfbc83 100644 --- a/engine/src/io/data_provider/UriDataProvider.cpp +++ b/engine/src/io/data_provider/UriDataProvider.cpp @@ -153,7 +153,12 @@ data_handle uri_data_provider::get_next(bool open_file) { // sometimes parquet directories have somes files that // have not the same schema as the *.parquet files // we don't want the data provider handle this ones - std::vector ignored_suffixes{".crc", "_metadata", "_SUCCESS"}; + std::vector ignored_suffixes { + ".crc", + "_metadata", + "_SUCCESS", + ".ipynb_checkpoints" + }; std::vector new_uris; for(int i = 0; i < this->directory_uris.size(); i++) {