Error with script to download historical intraday data
-
I've written a script to download free intraday data. The API I'm downloading from offers two years of data as CSV files split into 24 slices of 30 days each, counting back from the current day, so the slices are not full calendar months. The problem is that after downloading all files for all stocks in all timeframes for a given slice, I move on to the next slice and some files aren't written correctly: some contain the combined data from both slices, but others contain only the headers.
Here is the script; the error is probably around line 66.
    import os
    import io
    import math
    import time
    import os.path
    import sys
    import glob
    import asyncio
    import requests
    import traceback
    import urllib.request
    import logging as log
    import pandas as pd
    from time import sleep
    from random import randrange
    from datetime import datetime
    from typing import List, Dict, Tuple
    from pathlib import Path
    from proxybroker import Broker
    from itertools import cycle

    log.basicConfig(
        level=log.DEBUG,
        format=('%(asctime)s.%(msecs)03d:'
                '%(levelname)s:'
                '%(filename)s - %(module)s - %(funcName)s:\t'
                '%(message)s'),
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
            log.FileHandler("debug.log"),
            log.StreamHandler()
        ]
    )

    apikey = 'XXXXXXXXXXXXXXX'

    delay = 2 + 0.001

    BASE_URL = 'https://www.alphavantage.co/'

    # To download the data in a subdirectory where the script is located
    modpath = os.path.dirname(os.path.abspath(sys.argv[0]))


    def download_previous_data(
        file: str,
        ticker: str,
        timeframe: str,
        _slice: str,
    ):
        global apikey, delay

        url = f'{BASE_URL}query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol={ticker}&interval={timeframe}&slice={_slice}&apikey={apikey}&datatype=csv'

        log.info(f'Downloading {_slice} of {timeframe} for {ticker}...')

        try:
            while True:
                t = time.process_time()
                df = pd.read_csv(url).iloc[::-1]
                elapsed_time = time.process_time() - t
                if delay > elapsed_time:
                    sleep(delay - elapsed_time)
                if len(df.columns) == 6:
                    break
                sleep(1)

            if os.path.exists(file):
                pd.read_csv(file).append(df).drop_duplicates().to_csv(file, index=False, encoding='utf-8-sig')
            else:
                df.to_csv(file, index=False, mode='w', encoding='utf-8-sig')
        except Exception as e:
            log.info(f"Couldn't download data for {ticker} from {url}")
            log.error(e, exc_info=True)


    def get_tickers(filepath) -> List[str]:
        '''Get a list of all ticker symbols
        '''
        df = pd.read_csv(filepath)
        #tickers = df.loc[df['exchange'] == 'NYSE']['symbol'].tolist()
        tickers = df[(df.symbol == 'AN').idxmax():] \
            .loc[df['exchange'] == 'NYSE'] \
            .loc[df['assetType'] == 'Stock']['symbol'].tolist()
        return df, tickers


    def create_download_folders(timeframes: List[str]):
        for timeframe in timeframes:
            download_path = f'{modpath}/{timeframe}'
            #download_path = f'/media/user/Portable Drive/Trading/data/{timeframe}'
            Path(download_path).mkdir(parents=True, exist_ok=True)


    def use_stocks_from_file(filepath: str) -> List[str]:
        filepath = f'{modpath}/{filepath}'
        with open(filepath) as f:
            tickers = f.read().replace('\n', '').split(',')
        return tickers


    def get_data():
        filepath = f'{modpath}/stocks_alphavantage.csv'
        df, tickers = get_tickers(filepath)

        timeframes = ['1min', '5min', '15min', '30min', '60min']

        create_download_folders(timeframes)

        slices = ['year2month10', 'year2month9', 'year2month8', 'year2month7',
                  'year2month6', 'year2month5', 'year2month4', 'year2month3',
                  'year2month2', 'year2month1',
                  'year1month12', 'year1month11', 'year1month10', 'year1month9',
                  'year1month8', 'year1month7', 'year1month6', 'year1month5',
                  'year1month4', 'year1month3', 'year1month2', 'year1month1']

        for _slice in slices:
            for ticker in tickers:
                if ticker not in df.values:
                    log.info(f'{ticker} not available. Skiping...')
                    continue

                name = df.loc[df['symbol'] == ticker, 'name'].iat[0]
                log.info(f'Downloading data for {ticker}: {name}...')

                for timeframe in timeframes:
                    download_path = f'{modpath}/{timeframe}'
                    filepath = f'{download_path}/{ticker}.csv'
                    download_previous_data(filepath, ticker, timeframe, _slice)


    def main():
        get_data()


    if __name__ == '__main__':
        main()
-
But how do I reproduce the bug if I don't know why or when it happens?
-
You already have a script with a potential bug; now you need to figure out why it happens. Use more logging or a debugger: check what goes in, verify that the data you receive is correct, and in general check every link in the chain.
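
For example, here is a minimal sketch of the kind of checks you could add around the download-and-append step. It is not part of your script: the helper name `validate_and_append` and the `EXPECTED_COLUMNS` list are my own assumptions about the intraday CSV layout.

    import os
    import logging as log
    import pandas as pd

    # Assumed header of the intraday CSV returned by the API.
    EXPECTED_COLUMNS = ['time', 'open', 'high', 'low', 'close', 'volume']

    def validate_and_append(file: str, df: pd.DataFrame, ticker: str, _slice: str) -> None:
        """Log what was received before touching the file on disk."""
        log.debug(f'{ticker} {_slice}: received {len(df)} rows, columns={list(df.columns)}')

        # An empty frame or an unexpected header usually means the API sent an
        # error or rate-limit message instead of price data -- don't write it out.
        if df.empty or list(df.columns) != EXPECTED_COLUMNS:
            log.warning(f'{ticker} {_slice}: unexpected payload, skipping write')
            return

        if os.path.exists(file):
            old = pd.read_csv(file)
            combined = pd.concat([old, df]).drop_duplicates()
            log.debug(f'{ticker}: {len(old)} existing + {len(df)} new -> {len(combined)} rows')
            combined.to_csv(file, index=False, encoding='utf-8-sig')
        else:
            df.to_csv(file, index=False, encoding='utf-8-sig')

Since your DEBUG logging already goes to debug.log, the row counts and column names will tell you whether the headers-only files come from an empty or invalid API response or from the append step itself.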