Crypto tick and bar data: Difference between revisions

From Volatility.RED
No edit summary
 
(One intermediate revision by the same user not shown)
Line 67: Line 67:


<syntaxhighlight lang="python">
<syntaxhighlight lang="python">
# INSERT FULL KRAKEN HISTORICAL DATA SCRIPT HERE
#!/usr/bin/env python3
"""
Kraken Historical Data Collection Script
 
Copyright (c) 2025 Volatility.RED and FXGears.com
All rights reserved.
 
This software is provided for NON-COMMERCIAL USE ONLY.
 
Permissions:
- Personal use and modification for individual trading and research
- Educational use in academic or learning environments
- Internal use within organizations for research purposes
 
Restrictions:
- NO REDISTRIBUTION: This code may not be redistributed, shared, or published
  in any form, including but not limited to: websites, forums, repositories,
  social media, or any other public or private distribution channels
- NO REPUBLISHING: This code may not be republished or included in other
  software packages, tutorials, or educational materials
- NO COMMERCIAL USE: This code may not be used for commercial purposes,
  including but not limited to: selling, licensing, or incorporating into
  commercial products or services
 
 
 
THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND.
THE AUTHORS SHALL NOT BE LIABLE FOR ANY DAMAGES ARISING FROM ITS USE.
"""
 
import urllib.request
import urllib.parse
import urllib.error
import json
import csv
import hmac
import hashlib
import base64
import time
import os
from datetime import datetime, timedelta
 
# USER CONFIGURATION - Modify these variables as needed
SYMBOL = "XBTUSD"  # Trading pair (e.g., XBTUSD, ETHUSD, ADAUSD) - use Kraken's official pair names
DATA_TYPE = "trades"  # Options: "trades", "ohlc" ("spread" is mentioned but has no handler in DATA_HANDLERS)

# Only used for DATA_TYPE == "ohlc"; ignored for "trades"
INTERVAL = 1  # For OHLC data: 1, 5, 15, 30, 60, 240, 1440, 10080, 21600 (minutes)

DAYS = 10 # Pull data from X days back until now.
SINCE = int((datetime.now() - timedelta(days=DAYS)).timestamp())  # Unix timestamp or None for all available data
SLEEP_DELAY = 0.01  # Delay between API requests in seconds (raise to ~0.1 if rate-limited often)
RATE_LIMIT_WAIT = 5  # Base wait time (in seconds) for rate limit errors (exponential backoff multiplier)
BATCH_SIZE = 5000  # Number of records to request per API call (Kraken's authenticated max)

OUTPUT_FILE = "kraken_{}_{}_data_{}_to_{}.csv"  # Template: symbol, data type, start date, end date (YYYYMMDD)
API_KEY = ""  # Your Kraken API key
API_SECRET = ""  # Your Kraken API secret (base64-encoded, as issued by Kraken)

# Kraken API endpoints
BASE_URL = "https://api.kraken.com/0/public/"
PRIVATE_URL = "https://api.kraken.com/0/private/"
 
def get_auth_headers(uri_path, query_params):
    """Generate authentication headers for Kraken API.

    Args:
        uri_path: The URI path being signed (host stripped), e.g. "/0/public/Trades".
        query_params: Dict of query-string parameters included in the signature.

    Returns:
        Tuple of (headers dict, JSON body string). Both are empty when
        credentials are missing or signing fails, so callers fall back to
        an unauthenticated request.

    NOTE(review): the signed message here is
    path + SHA256(nonce + query_str + body_str), where the query params
    travel in the URL and only the nonce goes in the JSON body. Kraken's
    documented REST scheme signs path + SHA256(nonce + urlencoded POST
    body) - this variant's byte order is load-bearing; do not reorder
    these lines without re-verifying against the API.
    """
    if not API_KEY or not API_SECRET:
        return {}, ""
   
    try:
        nonce = str(int(time.time() * 1000))  # Use milliseconds for nonce (must be strictly increasing per key)
       
        # Separate query string (API params) from body (nonce only)
        query_str = urllib.parse.urlencode(query_params) if query_params else ""
        body = {"nonce": nonce}
        body_str = json.dumps(body)
       
        # Signature: HMAC-SHA512 over path + SHA256(nonce + query_str + body_str),
        # keyed with the base64-decoded API secret.
        combined_data = nonce + query_str + body_str
        sha256_hash = hashlib.sha256(combined_data.encode()).digest()
        message = uri_path.encode() + sha256_hash
        signature = hmac.new(base64.b64decode(API_SECRET), message, hashlib.sha512)
       
        headers = {
            'API-Key': API_KEY,
            'API-Sign': base64.b64encode(signature.digest()).decode(),
            'Content-Type': 'application/json'
        }
        return headers, body_str
       
    except Exception as e:
        # Typical failure: API_SECRET is not valid base64 (b64decode raises).
        print(f"Auth headers: Error generating signature - {e}")
        import traceback
        traceback.print_exc()
        return {}, ""
 
def make_request(url, params, use_auth=False, max_retries=3):
    """Make an API request with optional authentication and retry logic.

    Args:
        url: Full endpoint URL; query string is built from params.
        params: Dict of query parameters (may be None or empty).
        use_auth: Sign the request when True and credentials are configured.
        max_retries: Attempts before giving up on rate-limit responses.

    Returns:
        Parsed JSON response dict. On transport or parse failure, a dict
        shaped like Kraken's error envelope: {'error': [...]}.
    """
    # Pre-seed so the final return is defined even if max_retries <= 0.
    data = {'error': ['No request attempted']}
    for attempt in range(max_retries):
        try:
            # Build URL with query parameters
            query_str = urllib.parse.urlencode(params) if params else ""
            full_url = f"{url}?{query_str}" if query_str else url

            headers = {}
            body_data = None
            if use_auth and API_KEY and API_SECRET:
                # Kraken signs only the URI path, so strip the host.
                uri_path = url.replace('https://api.kraken.com', '')
                headers, body_str = get_auth_headers(uri_path, params)
                body_data = body_str.encode() if body_str else None
                if headers and attempt == 0:
                    print(f"AUTH: Using authenticated GET request to {uri_path}")
            else:
                if attempt == 0:
                    print("AUTH: Using public GET request (no authentication)")

            req = urllib.request.Request(full_url, data=body_data, headers=headers)
            # Context manager closes the connection even if read() fails
            # (the bare urlopen leaked it); timeout avoids an indefinite hang
            # on a stalled server - the generic handler below converts it
            # into an error dict.
            with urllib.request.urlopen(req, timeout=30) as response:
                response_text = response.read().decode()
            data = json.loads(response_text)

        except urllib.error.HTTPError as e:
            print(f"HTTP Error {e.code}: {e.read().decode()}")
            return {'error': [f'HTTP {e.code} error']}
        except Exception as e:
            print(f"DEBUG: Failed to parse JSON or make request: {e}")
            return {'error': ['Failed to parse response']}

        # Surface likely credential problems loudly for easier debugging.
        if 'error' in data and data['error']:
            if any('invalid' in str(err).lower() or 'auth' in str(err).lower() for err in data['error']):
                print(f"AUTH ERROR: {data['error']} - Check API credentials")

        # Rate-limit errors are retried with exponential backoff.
        if 'error' in data and any('too many requests' in str(err).lower() for err in data['error']):
            wait_time = (2 ** attempt) * RATE_LIMIT_WAIT  # Exponential backoff
            print(f"RATE LIMIT: Hit limit (attempt {attempt + 1}), waiting {wait_time} seconds...")
            time.sleep(wait_time)
            continue

        return data

    return data  # All retries exhausted; return the last (rate-limited) response
 
def get_historical_trades(pair, filename, since=None):
    """Download all historical trades for *pair*, streaming rows to CSV.

    Paginates the public Trades endpoint via the 'last' cursor until a
    short batch (fewer than BATCH_SIZE rows) signals the end of data.
    Rows are written as they arrive, so memory stays flat regardless of
    how much history is pulled.

    Args:
        pair: Kraken pair name (e.g. "XBTUSD").
        filename: Output CSV path (created/overwritten).
        since: Unix timestamp or cursor to start from; None for all data.

    Returns:
        {'total_records': n} on success, or {'error': [...]} on API error.
    """
    url = f"{BASE_URL}Trades"
    current_since = since
    total_records = 0

    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['timestamp', 'price', 'volume', 'buy_sell', 'market_limit', 'misc'])

        while True:
            params = {"pair": pair, "count": BATCH_SIZE}
            # 'is not None' so a legitimate cursor value of 0 is still sent.
            if current_since is not None:
                params["since"] = current_since

            data = make_request(url, params, use_auth=True)
            if 'error' in data and data['error']:
                return {'error': data['error']}

            batch_count = 0
            for pair_key in data['result']:
                if pair_key != 'last':
                    # Trade record layout (per response):
                    # [price, volume, time, buy/sell, market/limit, misc]
                    trades = data['result'][pair_key]
                    batch_count = len(trades)
                    for trade in trades:
                        writer.writerow([
                            datetime.fromtimestamp(float(trade[2])),
                            trade[0], trade[1], trade[3], trade[4], trade[5]
                        ])
                    total_records += batch_count

            # A full batch implies more data may remain; follow the cursor.
            if 'last' in data['result'] and batch_count == BATCH_SIZE:
                print(f"Fetched {total_records} records...")
                current_since = data['result']['last']
                time.sleep(SLEEP_DELAY)
            else:
                break

    return {'total_records': total_records}
 
def get_historical_ohlc(pair, filename, interval=1, since=None):
    """Download historical OHLC candles for *pair*, streaming rows to CSV.

    Paginates the public OHLC endpoint via the 'last' cursor until a
    short batch signals the end of available data.

    Args:
        pair: Kraken pair name (e.g. "XBTUSD").
        filename: Output CSV path (created/overwritten).
        interval: Candle size in minutes (1, 5, 15, 30, 60, 240, 1440, ...).
        since: Unix timestamp or cursor to start from; None for all data.

    Returns:
        {'total_records': n} on success, or {'error': [...]} on API error.
    """
    # Kraken's OHLC endpoint returns at most this many candles per call;
    # a full response means more data may remain.
    OHLC_MAX_ROWS = 720

    url = f"{BASE_URL}OHLC"
    current_since = since
    total_records = 0

    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['timestamp', 'open', 'high', 'low', 'close', 'vwap', 'volume', 'count'])

        while True:
            params = {"pair": pair, "interval": interval}
            # 'is not None' so a legitimate cursor value of 0 is still sent.
            if current_since is not None:
                params["since"] = current_since

            data = make_request(url, params, use_auth=True)
            if 'error' in data and data['error']:
                return {'error': data['error']}

            batch_count = 0
            for pair_key in data['result']:
                if pair_key != 'last':
                    # Candle layout: [time, open, high, low, close, vwap, volume, count]
                    ohlc_data = data['result'][pair_key]
                    batch_count = len(ohlc_data)
                    for candle in ohlc_data:
                        writer.writerow([
                            datetime.fromtimestamp(float(candle[0])),
                            candle[1], candle[2], candle[3], candle[4], candle[5], candle[6], candle[7]
                        ])
                    total_records += batch_count

            if 'last' in data['result'] and batch_count == OHLC_MAX_ROWS:
                print(f"Fetched {total_records} records...")
                current_since = data['result']['last']
                time.sleep(SLEEP_DELAY)
            else:
                break

    return {'total_records': total_records}
 
 
 
 
 
# Dispatch table for data type handlers. Each handler is called as
# handler(symbol, filename, since).
DATA_HANDLERS = {
    # get_historical_trades already matches the call shape exactly,
    # so no wrapper lambda is needed.
    "trades": get_historical_trades,
    # The OHLC handler additionally takes the module-level INTERVAL.
    "ohlc": lambda symbol, filename, since: get_historical_ohlc(symbol, filename, INTERVAL, since)
}
 
def test_authentication():
    """Probe a private endpoint to check whether the API credentials work.

    Returns True when authentication succeeds (a permission error still
    counts as working credentials), False on any other API error.
    """
    url = f"{PRIVATE_URL}OpenOrders"
    print(f"DEBUG: Testing auth with URL: {url}")
    data = make_request(url, {}, use_auth=True)

    print(f"DEBUG: Full response: {data}")

    errors = data.get('error')
    if not errors:
        print("AUTH TEST PASSED: Authentication is working")
        return True

    # A permission error proves the signature was accepted; only the
    # key's permission set is lacking.
    if any('permission' in str(err).lower() for err in errors):
        print("AUTH TEST PASSED: Authentication working but insufficient permissions")
        return True

    print(f"AUTH TEST FAILED: {data['error']}")
    return False
 
def main():
    """Entry point: validate credentials, build the output filename, and
    dispatch to the configured data-type handler. Exits non-zero on any
    configuration or API error."""
    # Check for required authentication
    if not API_KEY or not API_SECRET:
        print("ERROR: API_KEY and API_SECRET are required for data collection.")
        print("Please set your Kraken API credentials in the script configuration.")
        print("Register a free account at https://geni.us/GoKraken to get API access.")
        exit(1)

    print(f"Downloading {DATA_TYPE} data for {SYMBOL}...")
    print(f"Fetching data from {datetime.fromtimestamp(SINCE)} to now...")
    print("Using authenticated requests for optimal performance")

    try:
        start_date = datetime.fromtimestamp(SINCE).strftime('%Y%m%d')
        end_date = datetime.now().strftime('%Y%m%d')
        filename = OUTPUT_FILE.format(SYMBOL, DATA_TYPE, start_date, end_date)

        if os.path.exists(filename):
            os.remove(filename)
            # Fixed: previously printed a literal placeholder instead of the path.
            print(f"Removed existing file: {filename}")

        handler = DATA_HANDLERS.get(DATA_TYPE)
        if not handler:
            print(f"Invalid DATA_TYPE '{DATA_TYPE}'. Use: {', '.join(DATA_HANDLERS.keys())}")
            exit(1)

        result = handler(SYMBOL, filename, SINCE)

        # Handlers return either {'error': [...]} or {'total_records': n}.
        if 'error' in result:
            print(f"API Error: {result['error']}")
            exit(1)
        else:
            # Fixed: previously printed a literal placeholder instead of the path.
            print(f"Data saved to {filename} ({result['total_records']} records)")

    except Exception as e:
        print(f"Error: {e}")
        exit(1)
 
# Run the collector only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
</syntaxhighlight>
</syntaxhighlight>


Line 275: Line 570:
</pre>
</pre>
* '''Solution''': Use [https://geni.us/GoKraken Kraken's] official pair names (XBTUSD, not BTCUSD)
* '''Solution''': Use [https://geni.us/GoKraken Kraken's] official pair names (XBTUSD, not BTCUSD)
* '''Reference''': Check [https://api.kraken.com/0/public/AssetPairs [https://geni.us/GoKraken Kraken] Asset Pairs]
* '''Reference''': Check [https://geni.us/GoKraken Kraken Asset Pairs]


==== Optimization Tips ====
==== Optimization Tips ====

Latest revision as of 11:29, 29 August 2025

Crypto Historical Data Collection

This guide covers collecting both tick-level trade data and OHLC (candlestick) data from Kraken using a single Python script. The script can collect either individual trades (tick data) or aggregated OHLC bars at various timeframes, providing comprehensive historical market data for backtesting and algorithmic trading research.

Data Types Available

Tick (Trades) Data captures every individual trade executed on the exchange:

  • Price - The exact execution price of the trade
  • Volume - The quantity of the asset traded
  • Timestamp - Precise time when the trade occurred (microsecond precision)
  • Side - Whether the trade was a buy or sell (market taker direction)
  • Type - Market order vs limit order classification

OHLC (Candlestick) Data provides aggregated price action over specific time intervals:

  • Open - First trade price in the interval
  • High - Highest trade price in the interval
  • Low - Lowest trade price in the interval
  • Close - Last trade price in the interval
  • Volume - Total volume traded in the interval
  • VWAP - Volume-weighted average price
  • Count - Number of trades in the interval

Prerequisites

Before collecting historical data from Kraken, ensure you have:

  1. Python 3.7+ installed on your system
  2. Kraken account registration - Required for API access
  3. Kraken API credentials (API Key and Secret)
  4. Basic understanding of cryptocurrency trading concepts
  5. Sufficient storage space - Data files can be several GB for active pairs
  6. Stable internet connection - Data collection may run for hours

Account Registration Required: Practical data collection from Kraken requires API authentication to access reasonable batch sizes and request limits. Without authentication, data collection becomes prohibitively slow for any meaningful historical analysis.

Registering a Kraken account is free and requires no trading activity or deposits. Kraken is a fully regulated exchange, licensed in multiple jurisdictions and trusted by millions of users worldwide. Account registration takes only a few minutes and provides the API access essential for efficient data collection.

Kraken API Setup

Account Registration

If you don't have a Kraken account:

  1. Visit Kraken.com and click Create Account
  2. Complete email verification and basic information
  3. No identity verification required for API-only usage
  4. No deposits or trading required - API access is immediate

Creating API Credentials

  1. Log into your Kraken account
  2. Navigate to Settings → API
  3. Click Generate New Key
  4. Set permissions to Query Funds and Query Open Orders (minimum required)
  5. Important: Store your API Secret securely - it's only shown once

Understanding Rate Limits

Kraken enforces rate limiting on their API:

  • Authenticated requests: 5000 records per batch, 60 requests per minute
  • Rate limit errors: Exponential backoff required

The script handles rate limiting automatically with configurable delays and retry logic.

Code Implementation

Create a new Python file and configure the following parameters at the top:

#!/usr/bin/env python3
"""
Kraken Historical Data Collection Script

Copyright (c) 2025 Volatility.RED and FXGears.com
All rights reserved.

This software is provided for NON-COMMERCIAL USE ONLY.

Permissions:
- Personal use and modification for individual trading and research
- Educational use in academic or learning environments
- Internal use within organizations for research purposes

Restrictions:
- NO REDISTRIBUTION: This code may not be redistributed, shared, or published
  in any form, including but not limited to: websites, forums, repositories,
  social media, or any other public or private distribution channels
- NO REPUBLISHING: This code may not be republished or included in other
  software packages, tutorials, or educational materials
- NO COMMERCIAL USE: This code may not be used for commercial purposes,
  including but not limited to: selling, licensing, or incorporating into
  commercial products or services



THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND.
THE AUTHORS SHALL NOT BE LIABLE FOR ANY DAMAGES ARISING FROM ITS USE.
"""

import urllib.request
import urllib.parse
import urllib.error
import json
import csv
import hmac
import hashlib
import base64
import time
import os
from datetime import datetime, timedelta

# USER CONFIGURATION - Modify these variables as needed
SYMBOL = "XBTUSD"  # Trading pair (e.g., XBTUSD, ETHUSD, ADAUSD)
DATA_TYPE = "trades"  # Options: "trades", "ohlc", "spread"

#Not used for "trades" type data
INTERVAL = 1  # For OHLC data: 1, 5, 15, 30, 60, 240, 1440, 10080, 21600 (minutes)

DAYS = 10 # Pull data from X days back until now.
SINCE = int((datetime.now() - timedelta(days=DAYS)).timestamp())  # Unix timestamp or None for all available data
SLEEP_DELAY = 0.01  # Delay between API requests in seconds
RATE_LIMIT_WAIT = 5  # Base wait time (in seconds) for rate limit errors (exponential backoff multiplier)
BATCH_SIZE = 5000  # Number of records to request per API call

OUTPUT_FILE = "kraken_{}_{}_data_{}_to_{}.csv"  # Output filename
API_KEY = ""  # Your Kraken API key
API_SECRET = ""  # Your Kraken API secret

# Kraken API endpoints
BASE_URL = "https://api.kraken.com/0/public/"
PRIVATE_URL = "https://api.kraken.com/0/private/"

def get_auth_headers(uri_path, query_params):
    """Generate authentication headers for Kraken API"""
    if not API_KEY or not API_SECRET:
        return {}, ""
    
    try:
        nonce = str(int(time.time() * 1000))  # Use milliseconds for nonce
        
        # Separate query string (API params) from body (nonce only)
        query_str = urllib.parse.urlencode(query_params) if query_params else ""
        body = {"nonce": nonce}
        body_str = json.dumps(body)
        
        # Kraken's signature method: path + SHA256(nonce + query_str + body_str)
        combined_data = nonce + query_str + body_str
        sha256_hash = hashlib.sha256(combined_data.encode()).digest()
        message = uri_path.encode() + sha256_hash
        signature = hmac.new(base64.b64decode(API_SECRET), message, hashlib.sha512)
        
        headers = {
            'API-Key': API_KEY,
            'API-Sign': base64.b64encode(signature.digest()).decode(),
            'Content-Type': 'application/json'
        }
        return headers, body_str
        
    except Exception as e:
        print(f"Auth headers: Error generating signature - {e}")
        import traceback
        traceback.print_exc()
        return {}, ""

def make_request(url, params, use_auth=False, max_retries=3):
    """Make API request with optional authentication and retry logic"""
    for attempt in range(max_retries):
        try:
            # Build URL with query parameters
            query_str = urllib.parse.urlencode(params) if params else ""
            full_url = f"{url}?{query_str}" if query_str else url
            
            headers = {}
            body_data = None
            if use_auth and API_KEY and API_SECRET:
                uri_path = url.replace('https://api.kraken.com', '')
                headers, body_str = get_auth_headers(uri_path, params)
                body_data = body_str.encode() if body_str else None
                if headers and attempt == 0:
                    print(f"AUTH: Using authenticated GET request to {uri_path}")
            else:
                if attempt == 0:
                    print("AUTH: Using public GET request (no authentication)")
            
            req = urllib.request.Request(full_url, data=body_data, headers=headers)
            response = urllib.request.urlopen(req)
            
            response_text = response.read().decode()
            data = json.loads(response_text)
            
        except urllib.error.HTTPError as e:
            print(f"HTTP Error {e.code}: {e.read().decode()}")
            return {'error': [f'HTTP {e.code} error']}
        except Exception as e:
            print(f"DEBUG: Failed to parse JSON or make request: {e}")
            return {'error': ['Failed to parse response']}
        
        # Debug: Check response for auth issues
        if 'error' in data and data['error']:
            if any('invalid' in str(err).lower() or 'auth' in str(err).lower() for err in data['error']):
                print(f"AUTH ERROR: {data['error']} - Check API credentials")
        
        # Check for rate limit error
        if 'error' in data and any('too many requests' in str(err).lower() for err in data['error']):
            wait_time = (2 ** attempt) * RATE_LIMIT_WAIT  # Exponential backoff
            print(f"RATE LIMIT: Hit limit (attempt {attempt + 1}), waiting {wait_time} seconds...")
            time.sleep(wait_time)
            continue
        
        return data
    
    return data  # Return last response if all retries failed

def get_historical_trades(pair, filename, since=None):
    """Get all historical trade data with pagination and write directly to CSV"""
    url = f"{BASE_URL}Trades"
    current_since = since
    total_records = 0
    first_batch = True
    
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['timestamp', 'price', 'volume', 'buy_sell', 'market_limit', 'misc'])
        
        while True:
            params = {"pair": pair, "count": BATCH_SIZE}
            if current_since:
                params["since"] = current_since
            
            data = make_request(url, params, use_auth=True)
            if 'error' in data and data['error']:
                return {'error': data['error']}
            
            batch_count = 0
            for pair_key in data['result']:
                if pair_key != 'last':
                    trades = data['result'][pair_key]
                    batch_count = len(trades)
                    for trade in trades:
                        writer.writerow([
                            datetime.fromtimestamp(float(trade[2])),
                            trade[0], trade[1], trade[3], trade[4], trade[5]
                        ])
                    total_records += batch_count
            
            if 'last' in data['result'] and batch_count == BATCH_SIZE:
                print(f"Fetched {total_records} records...")
                current_since = data['result']['last']
                time.sleep(SLEEP_DELAY)
            else:
                break
    
    return {'total_records': total_records}

def get_historical_ohlc(pair, filename, interval=1, since=None):
    """Get all historical OHLC data with pagination and write directly to CSV"""
    url = f"{BASE_URL}OHLC"
    current_since = since
    total_records = 0
    
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['timestamp', 'open', 'high', 'low', 'close', 'vwap', 'volume', 'count'])
        
        while True:
            params = {"pair": pair, "interval": interval}
            if current_since:
                params["since"] = current_since
            
            data = make_request(url, params, use_auth=True)
            if 'error' in data and data['error']:
                return {'error': data['error']}
            
            batch_count = 0
            for pair_key in data['result']:
                if pair_key != 'last':
                    ohlc_data = data['result'][pair_key]
                    batch_count = len(ohlc_data)
                    for candle in ohlc_data:
                        writer.writerow([
                            datetime.fromtimestamp(float(candle[0])),
                            candle[1], candle[2], candle[3], candle[4], candle[5], candle[6], candle[7]
                        ])
                    total_records += batch_count
            
            if 'last' in data['result'] and batch_count == 720:
                print(f"Fetched {total_records} records...")
                current_since = data['result']['last']
                time.sleep(SLEEP_DELAY)
            else:
                break
    
    return {'total_records': total_records}





# Dispatch table for data type handlers
DATA_HANDLERS = {
    "trades": lambda symbol, filename, since: get_historical_trades(symbol, filename, since),
    "ohlc": lambda symbol, filename, since: get_historical_ohlc(symbol, filename, INTERVAL, since)
}

def test_authentication():
    """Test if authentication is working by calling a private endpoint"""
    url = f"{PRIVATE_URL}OpenOrders"
    params = {}
    print(f"DEBUG: Testing auth with URL: {url}")
    data = make_request(url, params, use_auth=True)
    
    print(f"DEBUG: Full response: {data}")
    
    if 'error' in data and data['error']:
        # Check if it's a permission error vs auth error
        if any('permission' in str(err).lower() for err in data['error']):
            print("AUTH TEST PASSED: Authentication working but insufficient permissions")
            return True
        else:
            print(f"AUTH TEST FAILED: {data['error']}")
            return False
    else:
        print("AUTH TEST PASSED: Authentication is working")
        return True

def main():
    # Check for required authentication
    if not API_KEY or not API_SECRET:
        print("ERROR: API_KEY and API_SECRET are required for data collection.")
        print("Please set your Kraken API credentials in the script configuration.")
        print("Register a free account at https://geni.us/GoKraken to get API access.")
        exit(1)
    
    print(f"Downloading {DATA_TYPE} data for {SYMBOL}...")
    print(f"Fetching data from {datetime.fromtimestamp(SINCE)} to now...")
    print("Using authenticated requests for optimal performance")
    
    try:
        start_date = datetime.fromtimestamp(SINCE).strftime('%Y%m%d')
        end_date = datetime.now().strftime('%Y%m%d')
        filename = OUTPUT_FILE.format(SYMBOL, DATA_TYPE, start_date, end_date)
        
        if os.path.exists(filename):
            os.remove(filename)
            print(f"Removed existing file: {filename}")
        
        handler = DATA_HANDLERS.get(DATA_TYPE)
        if not handler:
            print(f"Invalid DATA_TYPE '{DATA_TYPE}'. Use: {', '.join(DATA_HANDLERS.keys())}")
            exit(1)
        
        result = handler(SYMBOL, filename, SINCE)
        
        if 'error' in result:
            print(f"API Error: {result['error']}")
            exit(1)
        else:
            print(f"Data saved to {filename} ({result['total_records']} records)")
            
    except Exception as e:
        print(f"Error: {e}")
        exit(1)

if __name__ == "__main__":
    main()

Configuration Parameters

The script uses several key configuration variables:

Parameter Description Example Values
SYMBOL Trading pair to collect "XBTUSD", "ETHUSD", "ADAUSD"
DATA_TYPE Type of data to collect "trades", "ohlc", "spread"
INTERVAL OHLC timeframe (minutes) 1, 5, 15, 30, 60, 240, 1440
DAYS Historical lookback period 7, 30, 90
BATCH_SIZE Records per API request 5000 (recommended)
SLEEP_DELAY Delay between requests 0.01 to 1.0 seconds
API_KEY Your Kraken API key "your_api_key_here"
API_SECRET Your Kraken API secret "your_api_secret_here"

Tick (Trades) Data Collection

Configuration for Tick Data

To collect individual trade data, set these parameters:

SYMBOL = "XBTUSD"  # Trading pair
DATA_TYPE = "trades"  # Collect tick data
DAYS = 7  # Last 7 days of data

Running for Tick Data

python kraken_historical_data.py

The script will:

  1. Authenticate with Kraken API
  2. Begin fetching trade data in batches of 5000
  3. Stream data directly to CSV file
  4. Display progress every 1000 records
  5. Handle rate limits automatically

Tick Data Format

Tick data CSV files contain the following columns:

Column Description Example Value
timestamp Trade execution time 2024-01-15 14:23:17.123456
price Execution price 42350.50
volume Trade volume 0.15234
buy_sell Trade direction "b" (buy) or "s" (sell)
market_limit Order type "m" (market) or "l" (limit)
misc Additional flags "" (usually empty)

Sample tick data output:

timestamp,price,volume,buy_sell,market_limit,misc
2024-01-15 14:23:17.123456,42350.50,0.15234,b,m,
2024-01-15 14:23:17.234567,42351.00,0.05000,b,l,
2024-01-15 14:23:18.345678,42349.75,0.25000,s,m,

OHLC (Minute Bar) Data Collection

Configuration for OHLC Data

To collect OHLC candlestick data, set these parameters:

SYMBOL = "XBTUSD"  # Trading pair
DATA_TYPE = "ohlc"  # Collect OHLC data
INTERVAL = 1  # 1-minute bars
DAYS = 30  # Last 30 days of data

Available Timeframes

Kraken supports the following OHLC intervals (in minutes):

  • 1 - 1 minute bars
  • 5 - 5 minute bars
  • 15 - 15 minute bars
  • 30 - 30 minute bars
  • 60 - 1 hour bars
  • 240 - 4 hour bars
  • 1440 - Daily bars
  • 10080 - Weekly bars
  • 21600 - Monthly bars

OHLC Data Format

OHLC data CSV files contain the following columns:

Column Description Example Value
timestamp Candle open time 2024-01-15 14:23:00.000000
open Opening price 42350.50
high Highest price 42375.25
low Lowest price 42340.00
close Closing price 42360.75
vwap Volume weighted average price 42355.12
volume Total volume 15.234567
count Number of trades 127

Sample OHLC data output:

timestamp,open,high,low,close,vwap,volume,count
2024-01-15 14:23:00.000000,42350.50,42375.25,42340.00,42360.75,42355.12,15.234567,127
2024-01-15 14:24:00.000000,42360.75,42380.00,42355.50,42370.25,42368.45,12.567890,98
2024-01-15 14:25:00.000000,42370.25,42385.75,42365.00,42375.50,42374.22,18.901234,156

File Naming and Output

File Naming Convention

Output files follow these patterns:

  • Tick data: kraken_SYMBOL_trades_data_YYYYMMDD_to_YYYYMMDD.csv
  • OHLC data: kraken_SYMBOL_ohlc_data_YYYYMMDD_to_YYYYMMDD.csv

Examples:

  • kraken_XBTUSD_trades_data_20240108_to_20240115.csv
  • kraken_XBTUSD_ohlc_data_20240108_to_20240115.csv

Performance Considerations

Memory Usage

The script streams data directly to CSV files rather than storing in memory, allowing collection of unlimited historical data without memory constraints.

File Sizes

Typical file sizes for major pairs:

Tick Data:

  • Bitcoin (XBTUSD): ~50-100MB per day
  • Ethereum (ETHUSD): ~30-60MB per day
  • Altcoins: ~5-20MB per day

OHLC Data (1-minute bars):

  • Bitcoin (XBTUSD): ~2-5MB per day
  • Ethereum (ETHUSD): ~1-3MB per day
  • Altcoins: ~0.5-1MB per day

Collection Speed

With optimal settings:

  • Tick data: ~300,000 trades per hour
  • OHLC data: ~500,000 bars per hour
  • Full week of BTC tick data: ~2-4 hours
  • Full year of BTC 1-minute bars: ~30 minutes

Troubleshooting

Common Issues

Rate Limit Errors

RATE LIMIT: Hit limit (attempt 1), waiting 5 seconds...
  • Solution: Script handles automatically with exponential backoff
  • Prevention: Increase SLEEP_DELAY to 0.1 or higher

Authentication Errors

AUTH ERROR: ['EAPI:Invalid key'] - Check API credentials
  • Solution: Verify API key and secret are correct
  • Check: Ensure API permissions include required access

Network Timeouts

DEBUG: Failed to parse JSON or make request: timeout
  • Solution: Script retries automatically (3 attempts)
  • Prevention: Ensure stable internet connection

Invalid Trading Pair

{'error': ['EQuery:Unknown asset pair']}

Optimization Tips

  1. Set BATCH_SIZE to 5000 for maximum efficiency
  2. Use minimal SLEEP_DELAY (0.01s) for maximum speed
  3. Monitor rate limits - script will auto-adjust if needed
  4. Run during low-activity periods for faster collection

Advanced Usage

Collecting Multiple Pairs and Data Types

Modify the script to loop through multiple configurations:

# Collect both tick and OHLC data for multiple pairs
configs = [
    {"symbol": "XBTUSD", "data_type": "trades"},
    {"symbol": "XBTUSD", "data_type": "ohlc", "interval": 1},
    {"symbol": "ETHUSD", "data_type": "trades"},
    {"symbol": "ETHUSD", "data_type": "ohlc", "interval": 5},
]
for config in configs:
    # Run collection for each configuration

Custom Date Ranges

Set specific start/end dates instead of days lookback:

from datetime import datetime
SINCE = int(datetime(2024, 1, 1).timestamp())  # Start from Jan 1, 2024

Integration with Analysis Tools

The CSV output integrates seamlessly with:

  • Pandas for data analysis
  • NumPy for numerical computation
  • Matplotlib/Plotly for visualization
  • Zipline/Backtrader for backtesting
  • TimescaleDB for time-series storage

Next Steps

Once you have historical data collected:

With Tick Data:

  1. Analyze market microstructure patterns
  2. Build order flow indicators
  3. Develop high-frequency strategies
  4. Create custom aggregations (volume bars, tick bars)
  5. Backtest algorithms with realistic execution modeling

With OHLC Data:

  1. Build technical indicators (RSI, MACD, Bollinger Bands)
  2. Develop swing trading strategies
  3. Create multi-timeframe analysis
  4. Backtest position-based algorithms
  5. Perform statistical analysis of price movements