openml · LennartPurucker · Oct 14, 2024 · Oct 14, 2024 · Oct 15, 2024
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
@@ -460,7 +460,7 @@ def __parse_server_exception(
         raise OpenMLServerError(f"URI too long! ({url})")
 
     try:
-        server_exception = xmltodict.parse(response.text)
+        server_exception = xmltodict.parse(response.text, strip_whitespace=False)
     except xml.parsers.expat.ExpatError as e:
         raise e
     except Exception as e:

diff --git a/openml/base.py b/openml/base.py
@@ -137,7 +137,7 @@ def publish(self) -> OpenMLBase:
             "post",
             file_elements=file_elements,
         )
-        xml_response = xmltodict.parse(response_text)
+        xml_response = xmltodict.parse(response_text, strip_whitespace=False)
 
         self._parse_publish_response(xml_response)
         return self

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -1077,7 +1077,9 @@ def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]:
 
 
 def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]:
-    xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value"))
+    xml_dict = xmltodict.parse(
+        features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False
+    )
     features_xml = xml_dict["oml:data_features"]
 
     features: dict[int, OpenMLDataFeature] = {}
@@ -1140,6 +1142,8 @@ def _check_qualities(qualities: list[dict[str, str]]) -> dict[str, float]:
 
 
 def _parse_qualities_xml(qualities_xml: str) -> dict[str, float]:
-    xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
+    xml_as_dict = xmltodict.parse(
+        qualities_xml, force_list=("oml:quality",), strip_whitespace=False
+    )
     qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
     return _check_qualities(qualities)
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -64,7 +64,7 @@ def list_qualities() -> list[str]:
     """
     api_call = "data/qualities/list"
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    qualities = xmltodict.parse(xml_string, force_list=("oml:quality"))
+    qualities = xmltodict.parse(xml_string, force_list=("oml:quality"), strip_whitespace=False)
     # Minimalistic check if the XML is useful
     if "oml:data_qualities_list" not in qualities:
         raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"')
@@ -268,7 +268,7 @@ def __list_datasets(
     output_format: Literal["dict", "dataframe"] = "dict",
 ) -> dict | pd.DataFrame:
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
+    datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",), strip_whitespace=False)
 
     # Minimalistic check if the XML is useful
     assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type(
@@ -876,7 +876,7 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non
 
     data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status}
     result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data)
-    result = xmltodict.parse(result_xml)
+    result = xmltodict.parse(result_xml, strip_whitespace=False)
     server_data_id = result["oml:data_status_update"]["oml:id"]
     server_status = result["oml:data_status_update"]["oml:status"]
     if status != server_status or int(data_id) != int(server_data_id):
@@ -989,7 +989,7 @@ def edit_dataset(
         data=form_data,
         file_elements=file_elements,
     )
-    result = xmltodict.parse(result_xml)
+    result = xmltodict.parse(result_xml, strip_whitespace=False)
     data_id = result["oml:data_edit"]["oml:id"]
     return int(data_id)
 
@@ -1028,7 +1028,7 @@ def fork_dataset(data_id: int) -> int:
     # compose data fork parameters
     form_data = {"data_id": data_id}  # type: openml._api_calls.DATA_TYPE
     result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data)
-    result = xmltodict.parse(result_xml)
+    result = xmltodict.parse(result_xml, strip_whitespace=False)
     data_id = result["oml:data_fork"]["oml:id"]
     return int(data_id)
 
@@ -1106,7 +1106,7 @@ def _topic_add_dataset(data_id: int, topic: str) -> int:
         raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
     form_data = {"data_id": data_id, "topic": topic}  # type: openml._api_calls.DATA_TYPE
     result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data)
-    result = xmltodict.parse(result_xml)
+    result = xmltodict.parse(result_xml, strip_whitespace=False)
     data_id = result["oml:data_topic"]["oml:id"]
     return int(data_id)
 
@@ -1131,7 +1131,7 @@ def _topic_delete_dataset(data_id: int, topic: str) -> int:
         raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
     form_data = {"data_id": data_id, "topic": topic}  # type: openml._api_calls.DATA_TYPE
     result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data)
-    result = xmltodict.parse(result_xml)
+    result = xmltodict.parse(result_xml, strip_whitespace=False)
     data_id = result["oml:data_topic"]["oml:id"]
     return int(data_id)
 
@@ -1163,12 +1163,16 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str,
     try:
         with description_file.open(encoding="utf8") as fh:
             dataset_xml = fh.read()
-        description = xmltodict.parse(dataset_xml)["oml:data_set_description"]
+        description = xmltodict.parse(dataset_xml, strip_whitespace=False)[
+            "oml:data_set_description"
+        ]
     except Exception:  # noqa: BLE001
         url_extension = f"data/{dataset_id}"
         dataset_xml = openml._api_calls._perform_api_call(url_extension, "get")
         try:
-            description = xmltodict.parse(dataset_xml)["oml:data_set_description"]
+            description = xmltodict.parse(dataset_xml, strip_whitespace=False)[
+                "oml:data_set_description"
+            ]
         except ExpatError as e:
             url = openml._api_calls._create_url_from_endpoint(url_extension)
             raise OpenMLServerError(f"Dataset description XML at '{url}' is malformed.") from e
@@ -1488,7 +1492,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None:
     # build a dict from the xml.
     # use the url from the dataset description and return the ARFF string
     return openml._api_calls._download_text_file(
-        xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:url"],
+        xmltodict.parse(dataset_xml, strip_whitespace=False)["oml:data_set_description"]["oml:url"],
     )
 
 
@@ -1508,7 +1512,9 @@ def _get_online_dataset_format(dataset_id: int) -> str:
     """
     dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get")
     # build a dict from the xml and get the format from the dataset description
-    return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower()  # type: ignore
+    return xmltodict.parse(dataset_xml, strip_whitespace=False)["oml:data_set_description"][  # type: ignore
+        "oml:format"
+    ].lower()
 
 
 def delete_dataset(dataset_id: int) -> bool:

diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
@@ -230,7 +230,7 @@ def __list_evaluations(
 ) -> dict | pd.DataFrame:
     """Helper function to parse API calls which are lists of runs"""
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",))
+    evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",), strip_whitespace=False)
     # Minimalistic check if the XML is useful
     if "oml:evaluations" not in evals_dict:
         raise ValueError(
@@ -247,7 +247,7 @@ def __list_evaluations(
     )
     api_users = "user/list/user_id/" + ",".join(uploader_ids)
     xml_string_user = openml._api_calls._perform_api_call(api_users, "get")
-    users = xmltodict.parse(xml_string_user, force_list=("oml:user",))
+    users = xmltodict.parse(xml_string_user, force_list=("oml:user",), strip_whitespace=False)
     user_dict = {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]}
     for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]:
         run_id = int(eval_["oml:run_id"])
@@ -318,7 +318,7 @@ def list_evaluation_measures() -> list[str]:
     """
     api_call = "evaluationmeasure/list"
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    qualities = xmltodict.parse(xml_string, force_list=("oml:measures"))
+    qualities = xmltodict.parse(xml_string, force_list=("oml:measures"), strip_whitespace=False)
     # Minimalistic check if the XML is useful
     if "oml:evaluation_measures" not in qualities:
         raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"')
@@ -339,7 +339,7 @@ def list_estimation_procedures() -> list[str]:
     """
     api_call = "estimationprocedure/list"
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    api_results = xmltodict.parse(xml_string)
+    api_results = xmltodict.parse(xml_string, strip_whitespace=False)
 
     # Minimalistic check if the XML is useful
     if "oml:estimationprocedures" not in api_results:

diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -405,7 +405,7 @@ def from_filesystem(cls, input_directory: str | Path) -> OpenMLFlow:
         input_directory = Path(input_directory) / "flow.xml"
         with input_directory.open() as f:
             xml_string = f.read()
-        return OpenMLFlow._from_dict(xmltodict.parse(xml_string))
+        return OpenMLFlow._from_dict(xmltodict.parse(xml_string, strip_whitespace=False))
 
     def _parse_publish_response(self, xml_response: dict) -> None:
         """Parse the id from the xml_response and assign it to self."""

diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -312,7 +312,7 @@ def flow_exists(name: str, external_version: str) -> int | bool:
         data={"name": name, "external_version": external_version},
     )
 
-    result_dict = xmltodict.parse(xml_response)
+    result_dict = xmltodict.parse(xml_response, strip_whitespace=False)
     flow_id = int(result_dict["oml:flow_exists"]["oml:id"])
     return flow_id if flow_id > 0 else False
 
@@ -410,7 +410,7 @@ def __list_flows(
         The flows information in the specified output format.
     """
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",))
+    flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",), strip_whitespace=False)
 
     # Minimalistic check if the XML is useful
     assert isinstance(flows_dict["oml:flows"]["oml:flow"], list), type(flows_dict["oml:flows"])
@@ -623,7 +623,7 @@ def _create_flow_from_xml(flow_xml: str) -> OpenMLFlow:
     -------
     OpenMLFlow
     """
-    return OpenMLFlow._from_dict(xmltodict.parse(flow_xml))
+    return OpenMLFlow._from_dict(xmltodict.parse(flow_xml, strip_whitespace=False))
 
 
 def delete_flow(flow_id: int) -> bool:

diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -890,9 +890,11 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):  # type: ignore
 
         raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname)
 
-    run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[
-        "oml:run"
-    ]
+    run = xmltodict.parse(
+        xml,
+        force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"],
+        strip_whitespace=False,
+    )["oml:run"]
     run_id = obtain_field(run, "oml:run_id", from_server, cast=int)
     uploader = obtain_field(run, "oml:uploader", from_server, cast=int)
     uploader_name = obtain_field(run, "oml:uploader_name", from_server)
@@ -1225,7 +1227,7 @@ def __list_runs(
 ) -> dict | pd.DataFrame:
     """Helper function to parse API calls which are lists of runs"""
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",))
+    runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",), strip_whitespace=False)
     # Minimalistic check if the XML is useful
     if "oml:runs" not in runs_dict:
         raise ValueError(f'Error in return XML, does not contain "oml:runs": {runs_dict}')

diff --git a/openml/runs/trace.py b/openml/runs/trace.py
@@ -424,7 +424,9 @@ def trace_from_xml(cls, xml: str | Path | IO) -> OpenMLRunTrace:
         if isinstance(xml, Path):
             xml = str(xml.absolute())
 
-        result_dict = xmltodict.parse(xml, force_list=("oml:trace_iteration",))["oml:trace"]
+        result_dict = xmltodict.parse(
+            xml, force_list=("oml:trace_iteration",), strip_whitespace=False
+        )["oml:trace"]
 
         run_id = result_dict["oml:run_id"]
         trace = OrderedDict()

diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -60,7 +60,7 @@ def setup_exists(flow: OpenMLFlow) -> int:
         "post",
         file_elements=file_elements,
     )
-    result_dict = xmltodict.parse(result)
+    result_dict = xmltodict.parse(result, strip_whitespace=False)
     setup_id = int(result_dict["oml:setup_exists"]["oml:id"])
     return setup_id if setup_id > 0 else False
 
@@ -88,7 +88,7 @@ def _get_cached_setup(setup_id: int) -> OpenMLSetup:
     try:
         setup_file = setup_cache_dir / "description.xml"
         with setup_file.open(encoding="utf8") as fh:
-            setup_xml = xmltodict.parse(fh.read())
+            setup_xml = xmltodict.parse(fh.read(), strip_whitespace=False)
             return _create_setup_from_xml(setup_xml, output_format="object")  # type: ignore
 
     except OSError as e:
@@ -124,7 +124,7 @@ def get_setup(setup_id: int) -> OpenMLSetup:
         with setup_file.open("w", encoding="utf8") as fh:
             fh.write(setup_xml)
 
-    result_dict = xmltodict.parse(setup_xml)
+    result_dict = xmltodict.parse(setup_xml, strip_whitespace=False)
     return _create_setup_from_xml(result_dict, output_format="object")  # type: ignore
 
 
@@ -225,7 +225,7 @@ def __list_setups(
 ) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]:
     """Helper function to parse API calls which are lists of setups"""
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",))
+    setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",), strip_whitespace=False)
     openml_uri = "http://openml.org/openml"
     # Minimalistic check if the XML is useful
     if "oml:setups" not in setups_dict:

diff --git a/openml/study/functions.py b/openml/study/functions.py
@@ -88,7 +88,9 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy:  # noqa: C901
         "oml:run_id",
         "oml:tag",  # legacy.
     )
-    result_dict = xmltodict.parse(xml_string, force_list=force_list_tags)["oml:study"]
+    result_dict = xmltodict.parse(xml_string, force_list=force_list_tags, strip_whitespace=False)[
+        "oml:study"
+    ]
     study_id = int(result_dict["oml:id"])
     alias = result_dict.get("oml:alias", None)
     main_entity_type = result_dict["oml:main_entity_type"]
@@ -307,7 +309,7 @@ def update_study_status(study_id: int, status: str) -> None:
         raise ValueError("Illegal status value. " f"Legal values: {legal_status}")
     data = {"study_id": study_id, "status": status}  # type: openml._api_calls.DATA_TYPE
     result_xml = openml._api_calls._perform_api_call("study/status/update", "post", data=data)
-    result = xmltodict.parse(result_xml)
+    result = xmltodict.parse(result_xml, strip_whitespace=False)
     server_study_id = result["oml:study_status_update"]["oml:id"]
     server_status = result["oml:study_status_update"]["oml:status"]
     if status != server_status or int(study_id) != int(server_study_id):
@@ -388,7 +390,7 @@ def attach_to_study(study_id: int, run_ids: list[int]) -> int:
         request_method="post",
         data={"ids": ",".join(str(x) for x in run_ids)},
     )
-    result = xmltodict.parse(result_xml)["oml:study_attach"]
+    result = xmltodict.parse(result_xml, strip_whitespace=False)["oml:study_attach"]
     return int(result["oml:linked_entities"])
 
 
@@ -435,7 +437,7 @@ def detach_from_study(study_id: int, run_ids: list[int]) -> int:
         request_method="post",
         data=post_variables,
     )
-    result = xmltodict.parse(result_xml)["oml:study_detach"]
+    result = xmltodict.parse(result_xml, strip_whitespace=False)["oml:study_detach"]
     return int(result["oml:linked_entities"])
 
 
@@ -700,7 +702,7 @@ def __list_studies(
         depending on the value of 'output_format'.
     """
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    study_dict = xmltodict.parse(xml_string, force_list=("oml:study",))
+    study_dict = xmltodict.parse(xml_string, force_list=("oml:study",), strip_whitespace=False)
 
     # Minimalistic check if the XML is useful
     assert isinstance(study_dict["oml:study_list"]["oml:study"], list), type(

diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
@@ -83,7 +83,7 @@ def _get_estimation_procedure_list() -> list[dict[str, Any]]:
     url_suffix = "estimationprocedure/list"
     xml_string = openml._api_calls._perform_api_call(url_suffix, "get")
 
-    procs_dict = xmltodict.parse(xml_string)
+    procs_dict = xmltodict.parse(xml_string, strip_whitespace=False)
     # Minimalistic check if the XML is useful
     if "oml:estimationprocedures" not in procs_dict:
         raise ValueError("Error in return XML, does not contain tag oml:estimationprocedures.")
@@ -263,7 +263,9 @@ def __list_tasks(  # noqa: PLR0912, C901
         If an invalid key is found in the XML for a task.
     """
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
+    tasks_dict = xmltodict.parse(
+        xml_string, force_list=("oml:task", "oml:input"), strip_whitespace=False
+    )
     # Minimalistic check if the XML is useful
     if "oml:tasks" not in tasks_dict:
         raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}')
@@ -468,7 +470,7 @@ def _create_task_from_xml(xml: str) -> OpenMLTask:
     -------
     OpenMLTask
     """
-    dic = xmltodict.parse(xml)["oml:task"]
+    dic = xmltodict.parse(xml, strip_whitespace=False)["oml:task"]
     estimation_parameters = {}
     inputs = {}
     # Due to the unordered structure we obtain, we first have to extract

diff --git a/openml/utils.py b/openml/utils.py
@@ -157,7 +157,7 @@ def _tag_entity(entity_type: str, entity_id: int, tag: str, *, untag: bool = Fal
         {f"{entity_type}_id": entity_id, "tag": tag},
     )
 
-    result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag]
+    result = xmltodict.parse(result_xml, force_list={"oml:tag"}, strip_whitespace=False)[main_tag]
 
     if "oml:tag" in result:
         return result["oml:tag"]  # type: ignore
@@ -201,7 +201,7 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool:
     url_suffix = "%s/%d" % (entity_type, entity_id)
     try:
         result_xml = openml._api_calls._perform_api_call(url_suffix, "delete")
-        result = xmltodict.parse(result_xml)
+        result = xmltodict.parse(result_xml, strip_whitespace=False)
         return f"oml:{entity_type}_delete" in result
     except openml.exceptions.OpenMLServerException as e:
         # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php