Dedicated to helping users do more in less time.
spinesUtils is a user-friendly toolkit for the machine learning ecosystem, offering ready-to-use features such as:
- Logging functionality
- Type checking and parameter generation
- CSV file reading acceleration
- Classifiers for imbalanced data
- Pandas DataFrame data compression
- Pandas DataFrame insight tools
- Large data training and testing set splitting functions
- An intuitive timer.
It is currently undergoing rapid iteration. If you run into any problems with its functionality, feel free to open an issue.
You can install spinesUtils from PyPI:
pip install spinesUtils
You can use the Logger class to print your logs without worrying about handler conflicts with the native Python logging module.
This class provides log/debug/info/warning/error/critical methods. The debug/info/warning/error/critical methods are partial versions of the log method with the level already filled in, so you can call whichever one matches the level you need.
# load spinesUtils module
from spinesUtils.logging import Logger
# create a logger instance named "MyLogger" with no file handler; the level defaults to "INFO", but it is set to "DEBUG" here so that debug messages are shown
# You can specify a file path `fp` during instantiation. If not specified, logs will not be written to a file.
logger = Logger(name="MyLogger", fp=None, level="DEBUG")
logger.log("This is an info log emitted by the log function.", level='INFO')
logger.debug("This is an debug message")
logger.info("This is an info message.")
logger.warning("This is an warning message.")
logger.error("This is an error message.")
logger.critical("This is an critical message.")
2024-01-19 15:02:51 - MyLogger - INFO - This is an info log emitted by the log function.
2024-01-19 15:02:51 - MyLogger - DEBUG - This is an debug message
2024-01-19 15:02:51 - MyLogger - INFO - This is an info message.
2024-01-19 15:02:51 - MyLogger - WARNING - This is an warning message.
2024-01-19 15:02:51 - MyLogger - ERROR - This is an error message.
2024-01-19 15:02:51 - MyLogger - CRITICAL - This is an critical message.
from spinesUtils.asserts import *
# check parameter type
@ParameterTypeAssert({
'a': (int, float),
'b': (int, float)
})
def add(a, b):
pass
# try to pass a string to the function, and it will raise an ParametersTypeError error
add(a=1, b='2')
---------------------------------------------------------------------------
ParametersTypeError Traceback (most recent call last)
Cell In[2], line 12
9 pass
11 # try to pass a string to the function, and it will raise an ParametersTypeError error
---> 12 add(a=1, b='2')
File ~/projects/spinesUtils/spinesUtils/asserts/_inspect.py:196, in ParameterTypeAssert.__call__.<locals>.wrapper(*args, **kwargs)
194 if mismatched_params:
195 error_msg = self.build_type_error_msg(mismatched_params)
--> 196 raise ParametersTypeError(error_msg)
198 return func(**kwargs)
ParametersTypeError: Function 'add' parameter(s) type mismatch: b only accept '['int', 'float']' type.
# check parameter value
@ParameterValuesAssert({
'a': lambda x: x > 0,
'b': lambda x: x > 0
})
def add(a, b):
pass
# try to pass a negative number to the function, and it will raise an ParametersValueError error
add(a=1, b=-2)
---------------------------------------------------------------------------
ParametersValueError Traceback (most recent call last)
Cell In[3], line 10
7 pass
9 # try to pass a negative number to the function, and it will raise an ParametersValueError error
---> 10 add(a=1, b=-2)
File ~/projects/spinesUtils/spinesUtils/asserts/_inspect.py:258, in ParameterValuesAssert.__call__.<locals>.wrapper(*args, **kwargs)
256 if mismatched_params:
257 error_msg = self.build_values_error_msg(mismatched_params)
--> 258 raise ParametersValueError(error_msg)
260 return func(**kwargs)
ParametersValueError: Function 'add' parameter(s) values mismatch: `b` must in or satisfy ''b': lambda x: x > 0' condition(s).
# generate a dictionary of keyword arguments for a given function using provided arguments
generate_function_kwargs(add, a=1, b=2)
{'a': 1, 'b': 2}
# isinstance function with support for None
augmented_isinstance(1, (int, float, None))
True
# raise_if and raise_if_not functions
raise_if(ValueError, 1 == 1, "test raise_if")
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[6], line 2
1 # raise_if and raise_if_not functions
----> 2 raise_if(ValueError, 1 == 1, "test raise_if")
File ~/projects/spinesUtils/spinesUtils/asserts/_type_and_exceptions.py:115, in raise_if(exception, condition, error_msg)
112 assert issubclass(exception, BaseException), "Exception must be a subclass of BaseException."
114 if condition:
--> 115 raise exception(error_msg)
ValueError: test raise_if
raise_if_not(ZeroDivisionError, 1 != 1, "test raise_if_not")
---------------------------------------------------------------------------
ZeroDivisionError Traceback (most recent call last)
Cell In[7], line 1
----> 1 raise_if_not(ZeroDivisionError, 1 != 1, "test raise_if_not")
File ~/projects/spinesUtils/spinesUtils/asserts/_type_and_exceptions.py:144, in raise_if_not(exception, condition, error_msg)
141 assert issubclass(exception, BaseException), "Exception must be a subclass of BaseException."
143 if not condition:
--> 144 raise exception(error_msg)
ZeroDivisionError: test raise_if_not
from spinesUtils import read_csv
your_df = read_csv(
fp='/path/to/your/file.csv',
sep=',', # equal to pandas read_csv.sep
turbo_method='polars', # use turbo_method to speed up load time
chunk_size=None, # it can be an integer if you want to use the pandas backend
transform2low_mem=True, # it can compress the data to save more memory
verbose=False
)
from spinesUtils.models import MultiClassBalanceClassifier
# make a toy dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
dataset = make_classification(
n_samples=10000,
n_features=2,
n_informative=2,
n_redundant=0,
n_repeated=0,
n_classes=3,
n_clusters_per_class=1,
weights=[0.01, 0.05, 0.94],
class_sep=0.8,
random_state=0
)
X, y = dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from sklearn.ensemble import RandomForestClassifier
classifier = MultiClassBalanceClassifier(
base_estimator=RandomForestClassifier(n_estimators=100),
n_classes=3,
random_state=0,
verbose=0
)
# fit the classifier
classifier.fit(X_train, y_train)
# predict
y_pred = classifier.predict(X_test)
# print classification report
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.74 0.72 0.73 32
1 0.91 0.71 0.80 111
2 0.98 1.00 0.99 1857
accuracy 0.98 2000
macro avg 0.88 0.81 0.84 2000
weighted avg 0.98 0.98 0.98 2000
# make a toy dataset
import pandas as pd
import numpy as np
df = pd.DataFrame({
'a': np.random.randint(0, 100, 100000),
'b': np.random.randint(0, 100, 100000),
'c': np.random.randint(0, 100, 100000),
'd': np.random.randint(0, 100, 100000),
'e': np.random.randint(0, 100, 100000),
'f': np.random.randint(0, 100, 100000),
'g': np.random.randint(0, 100, 100000),
'h': np.random.randint(0, 100, 100000),
'i': np.random.randint(0, 100, 100000),
'j': np.random.randint(0, 100, 100000),
'k': np.random.randint(0, 100, 100000),
'l': np.random.randint(0, 100, 100000),
'm': np.random.randint(0, 100, 100000),
'n': np.random.randint(0, 100, 100000),
'o': np.random.randint(0, 100, 100000),
'p': np.random.randint(0, 100, 100000),
'q': np.random.randint(0, 100, 100000),
'r': np.random.randint(0, 100, 100000),
's': np.random.randint(0, 100, 100000),
't': np.random.randint(0, 100, 100000),
'u': np.random.randint(0, 100, 100000),
'v': np.random.randint(0, 100, 100000),
'w': np.random.randint(0, 100, 100000),
'x': np.random.randint(0, 100, 100000),
'y': np.random.randint(0, 100, 100000),
'z': np.random.randint(0, 100, 100000),
})
# compress dataframe
from spinesUtils import transform_dtypes_low_mem
transform_dtypes_low_mem(df, verbose=True, inplace=True)
Converting ...: 0%| | 0/26 [00:00<?, ?it/s]
[log] INFO - Memory usage before conversion is: 19.84 MB
[log] INFO - Memory usage after conversion is: 2.48 MB
[log] INFO - After conversion, the percentage of memory fluctuation is 87.5 %
# batch compress dataframes
from spinesUtils import transform_batch_dtypes_low_mem
# make some toy datasets
df1 = pd.DataFrame({
'a': np.random.randint(0, 100, 100000),
'b': np.random.randint(0, 100, 100000),
'c': np.random.randint(0, 100, 100000),
'd': np.random.randint(0, 100, 100000),
'e': np.random.randint(0, 100, 100000),
'f': np.random.randint(0, 100, 100000),
'g': np.random.randint(0, 100, 100000),
'h': np.random.randint(0, 100, 100000),
'i': np.random.randint(0, 100, 100000),
'j': np.random.randint(0, 100, 100000),
'k': np.random.randint(0, 100, 100000),
'l': np.random.randint(0, 100, 100000),
'm': np.random.randint(0, 100, 100000),
'n': np.random.randint(0, 100, 100000),
'o': np.random.randint(0, 100, 100000),
'p': np.random.randint(0, 100, 100000),
'q': np.random.randint(0, 100, 100000),
'r': np.random.randint(0, 100, 100000),
's': np.random.randint(0, 100, 100000),
't': np.random.randint(0, 100, 100000),
'u': np.random.randint(0, 100, 100000),
'v': np.random.randint(0, 100, 100000),
'w': np.random.randint(0, 100, 100000),
'x': np.random.randint(0, 100, 100000),
'y': np.random.randint(0, 100, 100000),
'z': np.random.randint(0, 100, 100000),
})
df2 = df1.copy()
df3 = df1.copy()
df4 = df1.copy()
# batch compress dataframes
transform_batch_dtypes_low_mem([df1, df2, df3, df4], verbose=True, inplace=True)
Batch converting ...: 0%| | 0/4 [00:00<?, ?it/s]
[log] INFO - Memory usage before conversion is: 79.35 MB
[log] INFO - Memory usage after conversion is: 9.92 MB
[log] INFO - After conversion, the percentage of memory fluctuation is 87.5 %
from spinesUtils import df_preview, classify_samples_dist
# make a toy dataset
import pandas as pd
import numpy as np
df = pd.DataFrame({
'a': np.random.randint(0, 100, 100000),
'b': np.random.randint(0, 100, 100000),
'c': np.random.randint(0, 100, 100000),
'd': np.random.randint(0, 100, 100000),
'e': np.random.randint(0, 100, 100000),
'f': np.random.randint(0, 100, 100000),
'g': np.random.randint(0, 100, 100000),
'h': np.random.randint(0, 100, 100000),
'i': np.random.randint(0, 100, 100000),
'j': np.random.randint(0, 100, 100000),
'k': np.random.randint(0, 100, 100000),
'l': np.random.randint(0, 100, 100000),
'm': np.random.randint(0, 100, 100000),
'n': np.random.randint(0, 100, 100000),
'o': np.random.randint(0, 100, 100000),
'p': np.random.randint(0, 100, 100000),
'q': np.random.randint(0, 100, 100000),
'r': np.random.randint(0, 100, 100000),
's': np.random.randint(0, 100, 100000),
't': np.random.randint(0, 100, 100000),
'u': np.random.randint(0, 100, 100000),
'v': np.random.randint(0, 100, 100000),
'w': np.random.randint(0, 100, 100000),
'x': np.random.randint(0, 100, 100000),
'y': np.random.randint(0, 100, 100000),
'z': np.random.randint(0, 100, 100000),
})
df_insight = df_preview(df)
df_insight
total | na | naPercent | nunique | dtype | max | 75% | median | 25% | min | mean | mode | variation | std | skew | kurt | samples | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
a | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 50.0 | 25.0 | 0.0 | 49.53968 | 36 | 0.9892 | 28.848392 | -0.000158 | -1.196434 | (32, 81) |
b | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 49.0 | 24.0 | 0.0 | 49.41822 | 40 | 0.98928 | 28.937601 | 0.005974 | -1.206987 | (76, 28) |
c | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 50.0 | 25.0 | 0.0 | 49.58261 | 82 | 0.98923 | 28.928019 | -0.003537 | -1.202994 | (21, 68) |
d | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 49.0 | 24.0 | 0.0 | 49.46308 | 9 | 0.98906 | 28.886459 | 0.003344 | -1.200654 | (42, 90) |
e | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 49.0 | 25.0 | 0.0 | 49.55014 | 37 | 0.98911 | 28.834041 | 0.003987 | -1.196103 | (15, 59) |
f | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 49.0 | 24.0 | 0.0 | 49.20195 | 4 | 0.98926 | 28.886463 | 0.009183 | -1.203297 | (72, 9) |
g | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 50.0 | 25.0 | 0.0 | 49.62199 | 4 | 0.98919 | 28.849264 | -0.012746 | -1.199283 | (69, 64) |
h | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 50.0 | 25.0 | 0.0 | 49.58739 | 40 | 0.98917 | 28.83744 | -0.004719 | -1.193858 | (30, 79) |
i | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 49.0 | 24.0 | 0.0 | 49.41076 | 10 | 0.98939 | 28.910095 | 0.005218 | -1.207459 | (36, 54) |
j | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 49.0 | 25.0 | 0.0 | 49.45686 | 46 | 0.98909 | 28.816681 | 0.004751 | -1.190756 | (29, 95) |
k | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 50.0 | 25.0 | 0.0 | 49.54948 | 46 | 0.98914 | 28.806187 | -0.003731 | -1.196876 | (32, 94) |
l | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 49.0 | 24.0 | 0.0 | 49.45631 | 20 | 0.98923 | 28.921314 | 0.002344 | -1.205342 | (22, 91) |
m | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 49.0 | 24.0 | 0.0 | 49.43142 | 49 | 0.98901 | 28.852962 | 0.002507 | -1.198267 | (94, 26) |
n | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 50.0 | 24.0 | 0.0 | 49.49325 | 8 | 0.98931 | 28.899022 | 0.000698 | -1.200786 | (46, 50) |
o | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 50.0 | 25.0 | 0.0 | 49.52091 | 4 | 0.98923 | 28.869563 | -0.003987 | -1.202426 | (33, 13) |
p | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 49.0 | 24.0 | 0.0 | 49.40997 | 61 | 0.98918 | 28.900207 | 0.007921 | -1.204621 | (58, 93) |
q | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 50.0 | 25.0 | 0.0 | 49.62826 | 33 | 0.98936 | 28.831896 | -0.003291 | -1.201172 | (82, 31) |
r | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 50.0 | 24.0 | 0.0 | 49.47208 | 60 | 0.98925 | 28.873943 | 0.000515 | -1.202925 | (0, 26) |
s | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 50.0 | 25.0 | 0.0 | 49.64847 | 48 | 0.9893 | 28.853741 | -0.010258 | -1.202701 | (94, 37) |
t | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 50.0 | 25.0 | 0.0 | 49.55305 | 32 | 0.98898 | 28.801028 | -0.001721 | -1.193403 | (85, 10) |
u | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 49.0 | 24.0 | 0.0 | 49.45428 | 80 | 0.98928 | 28.876812 | 0.002018 | -1.201612 | (56, 16) |
v | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 75.0 | 50.0 | 25.0 | 0.0 | 49.59953 | 16 | 0.98945 | 28.891313 | -0.006261 | -1.199011 | (60, 39) |
w | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 49.0 | 24.0 | 0.0 | 49.34131 | 4 | 0.98915 | 28.925175 | 0.009523 | -1.203308 | (78, 96) |
x | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 49.0 | 25.0 | 0.0 | 49.45791 | 95 | 0.98933 | 28.860322 | 0.007199 | -1.198962 | (93, 79) |
y | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 50.0 | 25.0 | 0.0 | 49.58517 | 34 | 0.98929 | 28.765474 | -0.000497 | -1.193016 | (80, 42) |
z | 100000 | 0 | 0.0 | 100 | int64 | 99.0 | 74.0 | 50.0 | 24.0 | 0.0 | 49.44355 | 21 | 0.98876 | 28.85751 | 0.000819 | -1.201063 | (25, 25) |
# make a toy dataset
import pandas as pd
import numpy as np
df = pd.DataFrame({
'a': np.random.randint(0, 100, 100000),
'b': np.random.randint(0, 100, 100000),
'c': np.random.randint(0, 100, 100000),
'd': np.random.randint(0, 100, 100000),
'e': np.random.randint(0, 100, 100000),
'f': np.random.randint(0, 100, 100000),
'g': np.random.randint(0, 100, 100000),
'h': np.random.randint(0, 100, 100000),
'i': np.random.randint(0, 100, 100000),
'j': np.random.randint(0, 100, 100000),
'k': np.random.randint(0, 100, 100000),
'l': np.random.randint(0, 100, 100000),
'm': np.random.randint(0, 100, 100000),
'n': np.random.randint(0, 100, 100000),
'o': np.random.randint(0, 100, 100000),
'p': np.random.randint(0, 100, 100000),
'q': np.random.randint(0, 100, 100000),
'r': np.random.randint(0, 100, 100000),
's': np.random.randint(0, 100, 100000),
't': np.random.randint(0, 100, 100000),
'u': np.random.randint(0, 100, 100000),
'v': np.random.randint(0, 100, 100000),
'w': np.random.randint(0, 100, 100000),
'x': np.random.randint(0, 100, 100000),
'y': np.random.randint(0, 100, 100000),
'z': np.random.randint(0, 100, 100000),
})
# split dataframe into training and testing sets
# return numpy.ndarray
from spinesUtils import train_test_split_bigdata
from spinesUtils.feature_tools import get_x_cols
X_train, X_valid, X_test, y_train, y_valid, y_test = train_test_split_bigdata(
df=df,
x_cols=get_x_cols(df, y_col='a'),
y_col='a',
shuffle=True,
return_valid=True,
train_size=0.8,
valid_size=0.5
)
print(X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape)
X_train[:5]
(80000, 25) (80000,) (10000, 25) (10000,) (10000, 25) (10000,)
array([[45, 83, 43, 94, 1, 86, 56, 0, 78, 60, 79, 42, 24, 43, 94, 83,
45, 50, 59, 50, 17, 99, 40, 95, 70],
[ 4, 81, 9, 25, 54, 18, 14, 6, 17, 39, 0, 36, 82, 33, 11, 76,
92, 29, 33, 50, 44, 11, 87, 86, 31],
[72, 82, 52, 96, 55, 89, 35, 71, 48, 73, 34, 19, 53, 89, 46, 57,
84, 67, 10, 40, 50, 61, 10, 76, 84],
[46, 45, 79, 53, 80, 85, 58, 65, 26, 49, 46, 97, 83, 47, 77, 97,
26, 4, 33, 79, 36, 65, 50, 94, 87],
[36, 7, 46, 10, 11, 33, 3, 7, 82, 29, 28, 2, 42, 89, 42, 66,
79, 51, 49, 43, 63, 14, 13, 74, 26]])
# return pandas.DataFrame
from spinesUtils import train_test_split_bigdata_df
from spinesUtils.feature_tools import get_x_cols
train_df, valid_df, test_df = train_test_split_bigdata_df(
df=df,
x_cols=get_x_cols(df, y_col='a'),
y_col='a',
shuffle=True,
return_valid=True,
train_size=0.8,
valid_size=0.5
)
print(train_df.shape, valid_df.shape, test_df.shape)
train_df.head()
(80000, 26) (10000, 26) (10000, 26)
b | c | d | e | f | g | h | i | j | k | ... | r | s | t | u | v | w | x | y | z | a | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14 | 67 | 41 | 87 | 68 | 87 | 27 | 67 | 26 | 62 | ... | 63 | 43 | 77 | 4 | 6 | 72 | 5 | 63 | 73 | 27 |
1 | 47 | 37 | 43 | 98 | 55 | 68 | 82 | 48 | 37 | 35 | ... | 99 | 92 | 23 | 44 | 92 | 14 | 54 | 95 | 58 | 59 |
2 | 52 | 97 | 71 | 62 | 18 | 54 | 22 | 2 | 57 | 93 | ... | 82 | 6 | 61 | 41 | 24 | 40 | 54 | 11 | 9 | 5 |
3 | 48 | 45 | 22 | 46 | 32 | 37 | 6 | 13 | 42 | 67 | ... | 9 | 1 | 65 | 84 | 11 | 86 | 54 | 22 | 89 | 85 |
4 | 26 | 23 | 55 | 31 | 61 | 72 | 68 | 82 | 6 | 19 | ... | 13 | 44 | 3 | 93 | 66 | 53 | 75 | 93 | 53 | 43 |
5 rows × 26 columns
# performance comparison
from sklearn.model_selection import train_test_split
from spinesUtils import train_test_split_bigdata, train_test_split_bigdata_df
from spinesUtils.feature_tools import get_x_cols
# make a toy dataset
import pandas as pd
import numpy as np
df = pd.DataFrame({
'a': np.random.randint(0, 100, 10000),
'b': np.random.randint(0, 100, 10000),
'c': np.random.randint(0, 100, 10000),
'd': np.random.randint(0, 100, 10000),
'e': np.random.randint(0, 100, 10000),
'f': np.random.randint(0, 100, 10000),
'g': np.random.randint(0, 100, 10000),
'h': np.random.randint(0, 100, 10000),
'i': np.random.randint(0, 100, 10000),
'j': np.random.randint(0, 100, 10000),
'k': np.random.randint(0, 100, 10000),
'l': np.random.randint(0, 100, 10000),
'm': np.random.randint(0, 100, 10000),
'n': np.random.randint(0, 100, 10000),
'o': np.random.randint(0, 100, 10000),
'p': np.random.randint(0, 100, 10000),
'q': np.random.randint(0, 100, 10000),
'r': np.random.randint(0, 100, 10000),
's': np.random.randint(0, 100, 10000),
't': np.random.randint(0, 100, 10000),
'u': np.random.randint(0, 100, 10000),
'v': np.random.randint(0, 100, 10000),
'w': np.random.randint(0, 100, 10000),
'x': np.random.randint(0, 100, 10000),
'y': np.random.randint(0, 100, 10000),
'z': np.random.randint(0, 100, 10000),
})
# define a helper that uses sklearn's train_test_split twice so that it also produces a validation set
def train_test_split_sklearn(df, x_cols, y_col, shuffle, train_size, valid_size):
X_train, X_test, y_train, y_test = train_test_split(df[x_cols], df[y_col], test_size=1-train_size, random_state=0, shuffle=shuffle)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=valid_size, random_state=0, shuffle=shuffle)
return X_train, X_valid, X_test, y_train, y_valid, y_test
%timeit X_train, X_valid, X_test, y_train, y_valid, y_test = train_test_split_sklearn(df=df, x_cols=get_x_cols(df, y_col='a'), y_col='a', shuffle=True, train_size=0.8, valid_size=0.5)
%timeit X_train, X_valid, X_test, y_train, y_valid, y_test = train_test_split_bigdata(df=df, x_cols=get_x_cols(df, y_col='a'), y_col='a', shuffle=True, return_valid=True, train_size=0.8, valid_size=0.5)
%timeit train_df, valid_df, test_df = train_test_split_bigdata_df(df=df, x_cols=get_x_cols(df, y_col='a'), y_col='a', shuffle=True, return_valid=True, train_size=0.8, valid_size=0.5)
1.28 ms ± 20.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.05 ms ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.36 ms ± 11.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
from spinesUtils.timer import Timer
# create a timer instance
timer = Timer()
# start the timer
timer.start()
# do something
for i in range(10):
# sleep for 1 second
timer.sleep(1)
# print the elapsed time since the last checkpoint (no middle_point() is called in this loop, so the value keeps growing from the start)
print("Elapsed time: {} seconds".format(timer.last_timestamp_diff()))
# print the elapsed time
print("Total elapsed time: {} seconds".format(timer.total_elapsed_time()))
# stop the timer
timer.end()
Elapsed time: 1.0117900371551514 seconds
Elapsed time: 2.016140937805176 seconds
Elapsed time: 3.0169479846954346 seconds
Elapsed time: 4.0224690437316895 seconds
Elapsed time: 5.027086019515991 seconds
Elapsed time: 6.0309507846832275 seconds
Elapsed time: 7.035104036331177 seconds
Elapsed time: 8.040709972381592 seconds
Elapsed time: 9.042311906814575 seconds
Elapsed time: 10.046867847442627 seconds
Total elapsed time: 10.047839879989624 seconds
10.047943830490112
from spinesUtils.timer import Timer
# you can also use the timer as a context manager
t = Timer()
with t.session():
t.sleep(1)
print("Last step elapsed time:", round(t.last_timestamp_diff(), 2), 'seconds')
t.middle_point()
t.sleep(2)
print("Last step elapsed time:", round(t.last_timestamp_diff(), 2), 'seconds')
total_elapsed_time = t.total_elapsed_time()
print("Total Time:", round(total_elapsed_time, 2), 'seconds')
Last step elapsed time: 1.01 seconds
Last step elapsed time: 2.01 seconds
Total Time: 3.01 seconds