joblib.Memory doesn’t seem to perform correct hash computation, so I wrote a ‘cache’ decorator. It considers the parameter names and argument values to determine whether a cached value can be used. As a bonus, it can even handle functions passed as arguments.
def cache(params_to_ignore: set[str] | None = None):
    """Decorator that caches a function's return value on disk.

    The cache key is a SHA-256 hash of the function's module and name plus
    every bound argument (name and value), excluding *params_to_ignore*.
    Function-valued arguments are hashed by their source code, so editing
    such a function invalidates the cache. Relies on externally defined
    ``save()``, ``load()`` and ``should_use_cache()`` helpers.

    Args:
        params_to_ignore: parameter names to exclude from the hash key.

    Raises:
        RuntimeError: when an argument's type is not hashable by this scheme
            (e.g., a list of DataFrames).
    """
    # Normalize once at decoration time instead of rebinding via `nonlocal`
    # on every call.
    ignored = params_to_ignore if params_to_ignore is not None else set()

    def cache_decorator(func):
        @functools.wraps(func)  # preserve __name__/__doc__ of the wrapped func
        def wrapper(*args, **kwargs):
            # Signature.bind is the documented replacement for the
            # deprecated inspect.getcallargs; apply_defaults() folds default
            # values into the key just as getcallargs did.
            bound = inspect.signature(func).bind(*args, **kwargs)
            bound.apply_defaults()
            h = hashlib.sha256()
            h.update(func.__module__.encode('utf8'))
            h.update(func.__name__.encode('utf8'))
            for k, v in bound.arguments.items():
                if k in ignored:
                    continue
                h.update(k.encode('utf8'))
                if isinstance(v, pd.DataFrame):
                    # column names are excluded in hash_pandas_object.
                    h.update(str(v.columns).encode('utf8'))
                if isinstance(v, pd.Series | pd.DataFrame | pd.Index):
                    for pd_hash in hash_pandas_object(v):
                        h.update(str(pd_hash).encode('utf8'))
                elif inspect.isfunction(v):
                    h.update(inspect.getsource(v).encode('utf8'))
                elif v is None:
                    # None is a common default value; don't reject it.
                    h.update(b'None')
                elif isinstance(v, str | int | float | bool):
                    h.update(str(v).encode('utf8'))
                else:
                    # e.g., list of pd.DataFrame
                    raise RuntimeError('Not supported type: ' + str(type(v)))
            filename = f'{h.hexdigest()}.pkl'
            if should_use_cache():
                cached = load(filename=filename)
                if cached is not None:
                    return cached
            ret_val = func(*args, **kwargs)
            save(ret_val, filename=filename)
            return ret_val
        return wrapper
    return cache_decorator
You need to write your own save(), load(), and should_use_cache(). The purpose of should_use_cache() is to temporarily disable cache operations (or to rewrite cached values).
Use the above like:
@cache({'b'}) # param 'b' is ignored from hash key
def foo(a, b, c):
... long computation ...
# Call anyway you want.
foo(1, 2, 3)
foo(1, b=2, c=3)
I’m using the caching technique above for my quant programs :-)