Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Strip Whitespace New PR #1363

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion openml/_api_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ def __parse_server_exception(
raise OpenMLServerError(f"URI too long! ({url})")

try:
server_exception = xmltodict.parse(response.text)
server_exception = xmltodict.parse(response.text, strip_whitespace=False)
except xml.parsers.expat.ExpatError as e:
raise e
except Exception as e:
Expand Down
2 changes: 1 addition & 1 deletion openml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def publish(self) -> OpenMLBase:
"post",
file_elements=file_elements,
)
xml_response = xmltodict.parse(response_text)
xml_response = xmltodict.parse(response_text, strip_whitespace=False)

self._parse_publish_response(xml_response)
return self
Expand Down
8 changes: 6 additions & 2 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,7 +1077,9 @@ def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]:


def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]:
xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value"))
xml_dict = xmltodict.parse(
features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False
)
features_xml = xml_dict["oml:data_features"]

features: dict[int, OpenMLDataFeature] = {}
Expand Down Expand Up @@ -1140,6 +1142,8 @@ def _check_qualities(qualities: list[dict[str, str]]) -> dict[str, float]:


def _parse_qualities_xml(qualities_xml: str) -> dict[str, float]:
xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
xml_as_dict = xmltodict.parse(
qualities_xml, force_list=("oml:quality",), strip_whitespace=False
)
qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
return _check_qualities(qualities)
28 changes: 17 additions & 11 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def list_qualities() -> list[str]:
"""
api_call = "data/qualities/list"
xml_string = openml._api_calls._perform_api_call(api_call, "get")
qualities = xmltodict.parse(xml_string, force_list=("oml:quality"))
qualities = xmltodict.parse(xml_string, force_list=("oml:quality"), strip_whitespace=False)
# Minimalistic check if the XML is useful
if "oml:data_qualities_list" not in qualities:
raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"')
Expand Down Expand Up @@ -268,7 +268,7 @@ def __list_datasets(
output_format: Literal["dict", "dataframe"] = "dict",
) -> dict | pd.DataFrame:
xml_string = openml._api_calls._perform_api_call(api_call, "get")
datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",), strip_whitespace=False)

# Minimalistic check if the XML is useful
assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type(
Expand Down Expand Up @@ -876,7 +876,7 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non

data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status}
result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
server_data_id = result["oml:data_status_update"]["oml:id"]
server_status = result["oml:data_status_update"]["oml:status"]
if status != server_status or int(data_id) != int(server_data_id):
Expand Down Expand Up @@ -989,7 +989,7 @@ def edit_dataset(
data=form_data,
file_elements=file_elements,
)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
data_id = result["oml:data_edit"]["oml:id"]
return int(data_id)

Expand Down Expand Up @@ -1028,7 +1028,7 @@ def fork_dataset(data_id: int) -> int:
# compose data fork parameters
form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE
result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
data_id = result["oml:data_fork"]["oml:id"]
return int(data_id)

Expand Down Expand Up @@ -1106,7 +1106,7 @@ def _topic_add_dataset(data_id: int, topic: str) -> int:
raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE
result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
data_id = result["oml:data_topic"]["oml:id"]
return int(data_id)

Expand All @@ -1131,7 +1131,7 @@ def _topic_delete_dataset(data_id: int, topic: str) -> int:
raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE
result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
data_id = result["oml:data_topic"]["oml:id"]
return int(data_id)

Expand Down Expand Up @@ -1163,12 +1163,16 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str,
try:
with description_file.open(encoding="utf8") as fh:
dataset_xml = fh.read()
description = xmltodict.parse(dataset_xml)["oml:data_set_description"]
description = xmltodict.parse(dataset_xml, strip_whitespace=False)[
"oml:data_set_description"
]
except Exception: # noqa: BLE001
url_extension = f"data/{dataset_id}"
dataset_xml = openml._api_calls._perform_api_call(url_extension, "get")
try:
description = xmltodict.parse(dataset_xml)["oml:data_set_description"]
description = xmltodict.parse(dataset_xml, strip_whitespace=False)[
"oml:data_set_description"
]
except ExpatError as e:
url = openml._api_calls._create_url_from_endpoint(url_extension)
raise OpenMLServerError(f"Dataset description XML at '{url}' is malformed.") from e
Expand Down Expand Up @@ -1488,7 +1492,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None:
# build a dict from the xml.
# use the url from the dataset description and return the ARFF string
return openml._api_calls._download_text_file(
xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:url"],
xmltodict.parse(dataset_xml, strip_whitespace=False)["oml:data_set_description"]["oml:url"],
)


Expand All @@ -1508,7 +1512,9 @@ def _get_online_dataset_format(dataset_id: int) -> str:
"""
dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get")
# build a dict from the xml and get the format from the dataset description
return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower() # type: ignore
return xmltodict.parse(dataset_xml, strip_whitespace=False)["oml:data_set_description"][ # type: ignore
"oml:format"
].lower()


def delete_dataset(dataset_id: int) -> bool:
Expand Down
8 changes: 4 additions & 4 deletions openml/evaluations/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def __list_evaluations(
) -> dict | pd.DataFrame:
"""Helper function to parse API calls which are lists of runs"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",))
evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",), strip_whitespace=False)
# Minimalistic check if the XML is useful
if "oml:evaluations" not in evals_dict:
raise ValueError(
Expand All @@ -247,7 +247,7 @@ def __list_evaluations(
)
api_users = "user/list/user_id/" + ",".join(uploader_ids)
xml_string_user = openml._api_calls._perform_api_call(api_users, "get")
users = xmltodict.parse(xml_string_user, force_list=("oml:user",))
users = xmltodict.parse(xml_string_user, force_list=("oml:user",), strip_whitespace=False)
user_dict = {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]}
for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]:
run_id = int(eval_["oml:run_id"])
Expand Down Expand Up @@ -318,7 +318,7 @@ def list_evaluation_measures() -> list[str]:
"""
api_call = "evaluationmeasure/list"
xml_string = openml._api_calls._perform_api_call(api_call, "get")
qualities = xmltodict.parse(xml_string, force_list=("oml:measures"))
qualities = xmltodict.parse(xml_string, force_list=("oml:measures"), strip_whitespace=False)
# Minimalistic check if the XML is useful
if "oml:evaluation_measures" not in qualities:
raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"')
Expand All @@ -339,7 +339,7 @@ def list_estimation_procedures() -> list[str]:
"""
api_call = "estimationprocedure/list"
xml_string = openml._api_calls._perform_api_call(api_call, "get")
api_results = xmltodict.parse(xml_string)
api_results = xmltodict.parse(xml_string, strip_whitespace=False)

# Minimalistic check if the XML is useful
if "oml:estimationprocedures" not in api_results:
Expand Down
2 changes: 1 addition & 1 deletion openml/flows/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ def from_filesystem(cls, input_directory: str | Path) -> OpenMLFlow:
input_directory = Path(input_directory) / "flow.xml"
with input_directory.open() as f:
xml_string = f.read()
return OpenMLFlow._from_dict(xmltodict.parse(xml_string))
return OpenMLFlow._from_dict(xmltodict.parse(xml_string, strip_whitespace=False))

def _parse_publish_response(self, xml_response: dict) -> None:
"""Parse the id from the xml_response and assign it to self."""
Expand Down
6 changes: 3 additions & 3 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def flow_exists(name: str, external_version: str) -> int | bool:
data={"name": name, "external_version": external_version},
)

result_dict = xmltodict.parse(xml_response)
result_dict = xmltodict.parse(xml_response, strip_whitespace=False)
flow_id = int(result_dict["oml:flow_exists"]["oml:id"])
return flow_id if flow_id > 0 else False

Expand Down Expand Up @@ -410,7 +410,7 @@ def __list_flows(
The flows information in the specified output format.
"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",))
flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",), strip_whitespace=False)

# Minimalistic check if the XML is useful
assert isinstance(flows_dict["oml:flows"]["oml:flow"], list), type(flows_dict["oml:flows"])
Expand Down Expand Up @@ -623,7 +623,7 @@ def _create_flow_from_xml(flow_xml: str) -> OpenMLFlow:
-------
OpenMLFlow
"""
return OpenMLFlow._from_dict(xmltodict.parse(flow_xml))
return OpenMLFlow._from_dict(xmltodict.parse(flow_xml, strip_whitespace=False))


def delete_flow(flow_id: int) -> bool:
Expand Down
10 changes: 6 additions & 4 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -890,9 +890,11 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore

raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname)

run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[
"oml:run"
]
run = xmltodict.parse(
xml,
force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"],
strip_whitespace=False,
)["oml:run"]
run_id = obtain_field(run, "oml:run_id", from_server, cast=int)
uploader = obtain_field(run, "oml:uploader", from_server, cast=int)
uploader_name = obtain_field(run, "oml:uploader_name", from_server)
Expand Down Expand Up @@ -1225,7 +1227,7 @@ def __list_runs(
) -> dict | pd.DataFrame:
"""Helper function to parse API calls which are lists of runs"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",))
runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",), strip_whitespace=False)
# Minimalistic check if the XML is useful
if "oml:runs" not in runs_dict:
raise ValueError(f'Error in return XML, does not contain "oml:runs": {runs_dict}')
Expand Down
4 changes: 3 additions & 1 deletion openml/runs/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,9 @@ def trace_from_xml(cls, xml: str | Path | IO) -> OpenMLRunTrace:
if isinstance(xml, Path):
xml = str(xml.absolute())

result_dict = xmltodict.parse(xml, force_list=("oml:trace_iteration",))["oml:trace"]
result_dict = xmltodict.parse(
xml, force_list=("oml:trace_iteration",), strip_whitespace=False
)["oml:trace"]

run_id = result_dict["oml:run_id"]
trace = OrderedDict()
Expand Down
8 changes: 4 additions & 4 deletions openml/setups/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def setup_exists(flow: OpenMLFlow) -> int:
"post",
file_elements=file_elements,
)
result_dict = xmltodict.parse(result)
result_dict = xmltodict.parse(result, strip_whitespace=False)
setup_id = int(result_dict["oml:setup_exists"]["oml:id"])
return setup_id if setup_id > 0 else False

Expand Down Expand Up @@ -88,7 +88,7 @@ def _get_cached_setup(setup_id: int) -> OpenMLSetup:
try:
setup_file = setup_cache_dir / "description.xml"
with setup_file.open(encoding="utf8") as fh:
setup_xml = xmltodict.parse(fh.read())
setup_xml = xmltodict.parse(fh.read(), strip_whitespace=False)
return _create_setup_from_xml(setup_xml, output_format="object") # type: ignore

except OSError as e:
Expand Down Expand Up @@ -124,7 +124,7 @@ def get_setup(setup_id: int) -> OpenMLSetup:
with setup_file.open("w", encoding="utf8") as fh:
fh.write(setup_xml)

result_dict = xmltodict.parse(setup_xml)
result_dict = xmltodict.parse(setup_xml, strip_whitespace=False)
return _create_setup_from_xml(result_dict, output_format="object") # type: ignore


Expand Down Expand Up @@ -225,7 +225,7 @@ def __list_setups(
) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]:
"""Helper function to parse API calls which are lists of setups"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",))
setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",), strip_whitespace=False)
openml_uri = "http://openml.org/openml"
# Minimalistic check if the XML is useful
if "oml:setups" not in setups_dict:
Expand Down
12 changes: 7 additions & 5 deletions openml/study/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy: # noqa: C901
"oml:run_id",
"oml:tag", # legacy.
)
result_dict = xmltodict.parse(xml_string, force_list=force_list_tags)["oml:study"]
result_dict = xmltodict.parse(xml_string, force_list=force_list_tags, strip_whitespace=False)[
"oml:study"
]
study_id = int(result_dict["oml:id"])
alias = result_dict.get("oml:alias", None)
main_entity_type = result_dict["oml:main_entity_type"]
Expand Down Expand Up @@ -307,7 +309,7 @@ def update_study_status(study_id: int, status: str) -> None:
raise ValueError("Illegal status value. " f"Legal values: {legal_status}")
data = {"study_id": study_id, "status": status} # type: openml._api_calls.DATA_TYPE
result_xml = openml._api_calls._perform_api_call("study/status/update", "post", data=data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
server_study_id = result["oml:study_status_update"]["oml:id"]
server_status = result["oml:study_status_update"]["oml:status"]
if status != server_status or int(study_id) != int(server_study_id):
Expand Down Expand Up @@ -388,7 +390,7 @@ def attach_to_study(study_id: int, run_ids: list[int]) -> int:
request_method="post",
data={"ids": ",".join(str(x) for x in run_ids)},
)
result = xmltodict.parse(result_xml)["oml:study_attach"]
result = xmltodict.parse(result_xml, strip_whitespace=False)["oml:study_attach"]
return int(result["oml:linked_entities"])


Expand Down Expand Up @@ -435,7 +437,7 @@ def detach_from_study(study_id: int, run_ids: list[int]) -> int:
request_method="post",
data=post_variables,
)
result = xmltodict.parse(result_xml)["oml:study_detach"]
result = xmltodict.parse(result_xml, strip_whitespace=False)["oml:study_detach"]
return int(result["oml:linked_entities"])


Expand Down Expand Up @@ -700,7 +702,7 @@ def __list_studies(
depending on the value of 'output_format'.
"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
study_dict = xmltodict.parse(xml_string, force_list=("oml:study",))
study_dict = xmltodict.parse(xml_string, force_list=("oml:study",), strip_whitespace=False)

# Minimalistic check if the XML is useful
assert isinstance(study_dict["oml:study_list"]["oml:study"], list), type(
Expand Down
8 changes: 5 additions & 3 deletions openml/tasks/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def _get_estimation_procedure_list() -> list[dict[str, Any]]:
url_suffix = "estimationprocedure/list"
xml_string = openml._api_calls._perform_api_call(url_suffix, "get")

procs_dict = xmltodict.parse(xml_string)
procs_dict = xmltodict.parse(xml_string, strip_whitespace=False)
# Minimalistic check if the XML is useful
if "oml:estimationprocedures" not in procs_dict:
raise ValueError("Error in return XML, does not contain tag oml:estimationprocedures.")
Expand Down Expand Up @@ -263,7 +263,9 @@ def __list_tasks( # noqa: PLR0912, C901
If an invalid key is found in the XML for a task.
"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
tasks_dict = xmltodict.parse(
xml_string, force_list=("oml:task", "oml:input"), strip_whitespace=False
)
# Minimalistic check if the XML is useful
if "oml:tasks" not in tasks_dict:
raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}')
Expand Down Expand Up @@ -468,7 +470,7 @@ def _create_task_from_xml(xml: str) -> OpenMLTask:
-------
OpenMLTask
"""
dic = xmltodict.parse(xml)["oml:task"]
dic = xmltodict.parse(xml, strip_whitespace=False)["oml:task"]
estimation_parameters = {}
inputs = {}
# Due to the unordered structure we obtain, we first have to extract
Expand Down
4 changes: 2 additions & 2 deletions openml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def _tag_entity(entity_type: str, entity_id: int, tag: str, *, untag: bool = Fal
{f"{entity_type}_id": entity_id, "tag": tag},
)

result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag]
result = xmltodict.parse(result_xml, force_list={"oml:tag"}, strip_whitespace=False)[main_tag]

if "oml:tag" in result:
return result["oml:tag"] # type: ignore
Expand Down Expand Up @@ -201,7 +201,7 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool:
url_suffix = "%s/%d" % (entity_type, entity_id)
try:
result_xml = openml._api_calls._perform_api_call(url_suffix, "delete")
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
return f"oml:{entity_type}_delete" in result
except openml.exceptions.OpenMLServerException as e:
# https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php
Expand Down
Loading
Loading