Improve binance caching #165

Status: Open · wants to merge 5 commits into base: master
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,6 +1,7 @@
# current

- Add: `resample_dataframe` utility function. Useful for resampling dataframes with multiple columns that are not candlestick data
- Fix: Improve Binance data caching to avoid unnecessary downloads

# 0.22.14

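The changelog does not show `resample_dataframe`'s signature, so here is a plain-pandas sketch of the kind of resampling the entry describes; the column names and aggregation rules are illustrative assumptions, not the library's API.

```python
import pandas as pd

# Hourly data with non-OHLC columns (illustrative names).
idx = pd.date_range("2024-01-01", periods=48, freq="h")
df = pd.DataFrame({"funding_rate": 0.0001, "open_interest": 1_000.0}, index=idx)

# Downsample to daily rows. Plain columns take simple aggregates;
# candlestick data would instead need open/high/low/close rules.
daily = df.resample("1D").agg({"funding_rate": "mean", "open_interest": "last"})
print(daily)
```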
83 changes: 69 additions & 14 deletions tradingstrategy/binance/downloader.py
@@ -1,7 +1,7 @@
"""Get candlestick price and volume data from Binance.

"""

import re
import requests
import datetime
import pandas as pd
@@ -129,9 +129,7 @@ def fetch_candlestick_data(
)
dataframes.append(df)

-# Count the cached file size
-path = self.get_parquet_path(symbol, time_bucket, start_at, end_at)
-total_size += os.path.getsize(path)
+total_size += self._get_parquet_size(df, symbol, time_bucket, start_at, end_at)

if progress_bar:
progress_bar.set_postfix(
@@ -361,11 +359,7 @@ def fetch_lending_rates(
)
dataframes.append(df)

-# Count the cached file size
-path = self.get_parquet_path(
-    asset_symbol, time_bucket, start_at, end_at, is_lending=True
-)
-total_size += os.path.getsize(path)
+total_size += self._get_parquet_size(df, asset_symbol, time_bucket, start_at, end_at, is_lending=True)

if progress_bar:
progress_bar.set_postfix(
@@ -558,6 +552,13 @@ def get_data_parquet(
:param end_at: End date of the data
:return: Path to the parquet file
"""
non_date_string = self._get_non_date_string(symbol, time_bucket, is_lending)
file_name_with_bigger_date_range = self._get_file_name_with_bigger_date_range(non_date_string, start_at, end_at)
if file_name_with_bigger_date_range:
    # A cached file already covers a superset of the requested range:
    # slice the requested range out of it instead of downloading again.
    bigger_parquet_path = self.cache_directory.joinpath(file_name_with_bigger_date_range)
    bigger_df = pd.read_parquet(bigger_parquet_path)
    return bigger_df[start_at:end_at]

path = self.get_parquet_path(symbol, time_bucket, start_at, end_at, is_lending)
try:
    return pd.read_parquet(path)
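A note on the early return above: `bigger_df[start_at:end_at]` relies on pandas label-based slicing, which assumes the cached frame is stored with a sorted DatetimeIndex and includes both endpoints. A minimal sketch of that behavior on synthetic data, not the downloader's actual cache:

```python
import datetime
import pandas as pd

# Stand-in for a cached frame covering a wider range than requested.
idx = pd.date_range("2021-01-01", "2022-01-01", freq="1D")
bigger_df = pd.DataFrame({"close": range(len(idx))}, index=idx)

start_at = datetime.datetime(2021, 6, 1)
end_at = datetime.datetime(2021, 7, 1)

# Label-based slice over a sorted DatetimeIndex; both endpoints inclusive.
subset = bigger_df[start_at:end_at]
assert subset.index.min() >= start_at and subset.index.max() <= end_at
```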
@@ -580,15 +581,69 @@ def get_parquet_path(
:param end_at: End date of the data
:return: Path to the parquet file
"""
non_date_string = self._get_non_date_string(symbol, time_bucket, is_lending)
file = Path(
    f"{non_date_string}-{start_at}-{end_at}.parquet"
)
return self.cache_directory.joinpath(file)

def _get_file_name_with_bigger_date_range(self, non_date_string: str, start_at: datetime.datetime, end_at: datetime.datetime):
    """Find a cached parquet file whose date range fully covers the requested range, if any."""
    for file in self.cache_directory.glob(f"{non_date_string}*.parquet"):
        match = re.match(rf"{non_date_string}-(\d{{4}}-\d{{2}}-\d{{2}} \d{{2}}:\d{{2}}:\d{{2}})-(\d{{4}}-\d{{2}}-\d{{2}} \d{{2}}:\d{{2}}:\d{{2}})\.parquet", file.name)
        if match:
            file_start_at = datetime.datetime.fromisoformat(match.group(1))
            file_end_at = datetime.datetime.fromisoformat(match.group(2))
            if file_start_at <= start_at and file_end_at >= end_at:
                return file.name
    return None
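For reference, the regex above expects cache file names shaped like the hypothetical example below, where the timestamps come from Python's default `str(datetime.datetime)` formatting, which `fromisoformat()` parses back:

```python
import datetime
import re

# Hypothetical cache entry; the prefix is what _get_non_date_string returns.
non_date_string = "candles-ETHUSDC-1d"
name = f"{non_date_string}-2021-01-01 00:00:00-2022-01-01 00:00:00.parquet"

pattern = (
    rf"{non_date_string}"
    r"-(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
    r"-(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\.parquet"
)
match = re.match(pattern, name)
assert match
file_start_at = datetime.datetime.fromisoformat(match.group(1))  # 2021-01-01 00:00:00
file_end_at = datetime.datetime.fromisoformat(match.group(2))    # 2022-01-01 00:00:00
```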

@staticmethod
def _get_non_date_string(
    symbol: str,
    time_bucket: TimeBucket,
    is_lending: bool,
) -> str:
    """Get the part of the parquet file name that is not a date.

    :param symbol: Trading pair symbol, e.g. ETHUSDC
    :param time_bucket: TimeBucket instance
    :param is_lending: Whether the data is lending data or not
    :return: File name prefix, e.g. "candles-ETHUSDC-1d"
    """
    if is_lending:
        file_str = "lending"
    else:
        file_str = "candles"

-    file = Path(
-        file_str + f"-{symbol}-{time_bucket.value}-{start_at}-{end_at}.parquet"
-    )
-    return self.cache_directory.joinpath(file)
+    return f"{file_str}-{symbol}-{time_bucket.value}"

def _get_parquet_size(
    self,
    df: pd.DataFrame,
    symbol: str,
    time_bucket: TimeBucket,
    start_at: datetime.datetime,
    end_at: datetime.datetime,
    is_lending: bool = False,
) -> int:
"""Gets the size of the parquet file or the dataframe in memory if the parquet file does not exist.

:param df: Dataframe
:param symbol: Trading pair symbol E.g. ETHUSDC
:param time_bucket: TimeBucket instance
:param start_at: Start date of the data
:param end_at: End date of the data
:param is_lending: Whether the data is lending data or not
:return: Size in bytes
"""
    try:
        # Count the cached file size
        path = self.get_parquet_path(symbol, time_bucket, start_at, end_at, is_lending)
        size = os.path.getsize(path)
    except OSError:
        # The file is not cached yet: estimate from the in-memory footprint.
        # Rough testing suggests a parquet file is about half the in-memory
        # size, so divide by two. This can be inaccurate.
        size = df.memory_usage(deep=True).sum() // 2
    return size
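The divide-by-two fallback above is an empirical heuristic; the real in-memory-to-parquet ratio varies with dtypes and compressibility. A quick sanity check of the ratio on synthetic data, assuming a parquet engine such as pyarrow is installed:

```python
import os
import tempfile

import numpy as np
import pandas as pd

# Compare the in-memory footprint against the written parquet file size.
df = pd.DataFrame(np.random.randn(100_000, 5), columns=list("abcde"))

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "test.parquet")
    df.to_parquet(path)
    memory = df.memory_usage(deep=True).sum()
    on_disk = os.path.getsize(path)
    print(f"in-memory: {memory:,} B, parquet: {on_disk:,} B, ratio {memory / on_disk:.2f}")
```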

def overwrite_cached_data(
self,