import os
import random
import secrets
import string
import time
from typing import Iterable, Union

import numpy as np
import pandas as pd
import torch
# [docs]  (Sphinx source-link residue; commented out so it is not evaluated as code)
def fill_open_zeros(group: pd.DataFrame) -> pd.DataFrame:
    """Back-fill zero ``mid_price`` values when a group opens on a zero quote.

    If the first row's ``mid_price`` is zero, every zero in the group's
    ``mid_price`` column is overwritten with the first non-zero ``mid_price``
    whose ``datetime`` time-of-day is at or before 09:35. A group whose first
    row is non-zero is left alone, even if zeros appear later.

    Args:
        group: Frame with a ``mid_price`` column and a ``datetime`` column that
            supports the ``.dt`` accessor.

    Returns:
        The same ``group`` object. It is modified in place when a fill happens
        and returned unchanged when the group is empty, does not open on a
        zero, or has no non-zero quote at or before 09:35.
    """
    # Nothing to repair: no rows at all, or the opening quote is already valid.
    if group.empty or group["mid_price"].iloc[0] != 0:
        return group

    cutoff = pd.Timestamp("09:35").time()
    candidates = group.loc[
        (group["mid_price"] != 0) & (group["datetime"].dt.time <= cutoff),
        "mid_price",
    ]
    # No usable quote inside the opening window: leave the zeros as they are
    # rather than failing on an empty selection.
    if candidates.empty:
        return group

    group.loc[group["mid_price"] == 0, "mid_price"] = candidates.iloc[0]
    return group
# [docs]  (Sphinx source-link residue; commented out so it is not evaluated as code)
def generate_random_id(length: int = 10) -> str:
    """Return a random alphanumeric identifier.

    Characters are drawn from ASCII letters and digits using the operating
    system's entropy source. The shared ``random`` generator is deliberately
    not reseeded here: seeding it from the millisecond clock made calls within
    the same millisecond return the same ID, and it overwrote any seed the
    caller had set for reproducibility.

    Args:
        length: Number of characters to generate. Zero or a negative value
            yields an empty string.

    Returns:
        A string of ``length`` characters from ``[A-Za-z0-9]``.
    """
    alphabet = string.ascii_letters + string.digits
    return "".join(secrets.choice(alphabet) for _ in range(length))
# [docs]  (Sphinx source-link residue; commented out so it is not evaluated as code)
def datetime_to_tensor(
    datetime_array: Iterable[Union[str, np.datetime64]], unit: str = "s"
) -> torch.Tensor:
    """Convert a sequence of datetime-like values to an int64 timestamp tensor.

    Each element may be a string (parsed with ``pd.to_datetime``), a
    ``pd.Timestamp``, a ``np.datetime64``, or anything ``np.datetime64()``
    accepts, such as a ``datetime.datetime``. Every value is cast to
    ``datetime64[unit]`` and stored as the integer count of ``unit`` since the
    Unix epoch.

    Conversion is best-effort per element: if one value cannot be converted,
    a message is printed and ``0`` (the epoch) is stored in its place.

    Args:
        datetime_array: Iterable of datetime-like values.
        unit: NumPy datetime unit code for the output ('s', 'ms', 'us', 'ns', ...).

    Returns:
        1-D ``torch.int64`` tensor with one timestamp per input element.
    """
    target_dtype = f"datetime64[{unit}]"
    timestamps = []
    for dt in datetime_array:
        try:
            if isinstance(dt, str):
                dt = pd.to_datetime(dt)
            if isinstance(dt, pd.Timestamp):
                # np.datetime64(Timestamp) treats it as a datetime.datetime and
                # truncates to microseconds; to_datetime64() keeps nanoseconds.
                dt = dt.to_datetime64()
            elif not isinstance(dt, np.datetime64):
                dt = np.datetime64(dt)
            timestamps.append(int(dt.astype(target_dtype).astype(np.int64)))
        except Exception as e:
            # Deliberate best-effort: report the bad element and fall back to
            # the epoch so one bad value does not abort the whole batch.
            print(f"Error converting datetime {dt}: {e}")
            timestamps.append(0)
    return torch.tensor(timestamps, dtype=torch.int64)
# [docs]  (Sphinx source-link residue; commented out so it is not evaluated as code)
def tensor_to_datetime(
    timestamp_tensor: torch.Tensor,
    unit: str = "s",
    batch_mode: bool = False,
) -> np.ndarray:
    """Convert a tensor of epoch timestamps to a NumPy ``datetime64`` array.

    Each value is interpreted as a count of ``unit`` since 1970-01-01T00:00:00.
    The conversion is a single vectorised cast, so the output has the same
    shape as the input whatever its number of dimensions.

    Args:
        timestamp_tensor: Tensor of timestamps. Non-integer values are
            truncated toward zero before conversion.
        unit: NumPy datetime unit code the timestamps are expressed in
            ('s', 'ms', 'us', 'ns', ...).
        batch_mode: Kept for backward compatibility and ignored. 1-D and 2-D
            (batched) inputs are handled identically.

    Returns:
        ``np.ndarray`` of dtype ``datetime64[unit]`` with the input's shape.

    Raises:
        TypeError: If ``unit`` is not a valid NumPy datetime unit.
    """
    # detach() lets tensors that track gradients be exported; cpu() moves
    # device tensors to host memory before the NumPy conversion.
    raw = timestamp_tensor.detach().cpu().numpy()
    # Casting int64 to datetime64[unit] treats each integer as an offset from
    # the Unix epoch, which is what the explicit epoch + timedelta sum did.
    return raw.astype(np.int64).astype(f"datetime64[{unit}]")
# Test: datetime conversion functions
import pandas as pd
from datetime import datetime
# [docs]  (Sphinx source-link residue; commented out so it is not evaluated as code)
def run_test():
    """Print a walkthrough of the datetime <-> tensor conversion helpers.

    This is a demonstration script, not an asserting test: it converts
    several sample inputs with ``datetime_to_tensor`` and
    ``tensor_to_datetime`` and prints each stage so the round trips can be
    checked by eye. It returns nothing.
    """
    # --- Test 1: a flat list mixing every supported input type -------------
    print("Test 1: Single datetime array conversion")
    dt_samples = [
        np.datetime64("2023-01-01 12:00:00"),
        datetime(2023, 1, 2, 13, 30, 45),
        "2023-01-03 14:45:30",
        pd.Timestamp("2023-01-04 16:15:00"),
    ]
    print(f"Original sample: {dt_samples}")
    tensor = datetime_to_tensor(dt_samples)
    print(f"Tensor: {tensor}")
    dt_array = tensor_to_datetime(tensor)
    print(f"Converted back: {dt_array}")
    print(f"Original type: {type(dt_samples[0])}, Converted type: {type(dt_array[0])}")

    # --- Test 2: two batches of three timestamps, stacked into 2-D ---------
    print("\nTest 2: Batched datetime conversion")
    batch_1 = [
        np.datetime64("2023-01-05 09:00:00"),
        np.datetime64("2023-01-05 10:00:00"),
        np.datetime64("2023-01-05 11:00:00"),
    ]
    batch_2 = [
        np.datetime64("2023-01-06 09:00:00"),
        np.datetime64("2023-01-06 10:00:00"),
        np.datetime64("2023-01-06 11:00:00"),
    ]
    batched_samples = [batch_1, batch_2]
    batched_tensor = torch.stack([datetime_to_tensor(b) for b in batched_samples])
    print(f"Batched tensor shape: {batched_tensor.shape}")
    print(f"Batched tensor: {batched_tensor}")
    batched_dt = tensor_to_datetime(batched_tensor, batch_mode=True)
    print(f"Batched datetime array shape: {batched_dt.shape}")
    print(f"Batched datetime array: {batched_dt}")

    # --- Test 2b: repeat the batched conversion and compare per element ----
    print("\nTest 2b: Batched round-trip conversion")
    print(f"Original batch 1: {batch_1}")
    print(f"Original batch 2: {batch_2}")
    batched_tensor = torch.stack([datetime_to_tensor(b) for b in batched_samples])
    print(f"Batched tensor: {batched_tensor}")
    batched_dt = tensor_to_datetime(batched_tensor, batch_mode=True)
    print(f"Converted back: {batched_dt}")
    print("Verification - original vs converted:")
    for i, source_row in enumerate(batched_samples):
        for j, original in enumerate(source_row):
            converted = batched_dt[i][j]
            equal = original == converted
            print(f" Batch {i}, index {j}: {original} → {converted}, Equal: {equal}")

    # --- Test 3: start from raw integer timestamps --------------------------
    print("\nTest 3: Real-world timestamp values")
    test_timestamps = torch.tensor(
        [[1673256600, 1673260200, 1673263800], [1674729000, 1674732600, 1674736200]]
    )
    print(f"Test timestamps shape: {test_timestamps.shape}")
    print(f"Test timestamps: {test_timestamps}")
    test_dt = tensor_to_datetime(test_timestamps, batch_mode=True)
    print(f"Converted datetime array: {test_dt}")

    # --- Test 3b: convert each row back and rebuild the 2-D tensor ----------
    print("\nTest 3b: Real-world round-trip conversion")
    first_row = list(test_dt[0])
    second_row = list(test_dt[1])
    reconverted_tensor = torch.stack(
        [datetime_to_tensor(first_row), datetime_to_tensor(second_row)]
    )
    print(f"Original timestamps: {test_timestamps}")
    print(f"Reconverted timestamps: {reconverted_tensor}")
    print(f"Equal: {torch.all(test_timestamps == reconverted_tensor).item()}")
# Run the conversion walkthrough only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_test()