Saturday, 18 May 2013

Scrape Yahoo Finance

#!/bin/sh
#
# scrape_yahoo.sh
#
# This script pulls data from yahoo using a NYSE index list generated
# from scrape_nyse.sh.  It iterates through the list, saving data from each
# security in one big log and separate per-security log files.  This stuff
# should go into a database, sooner or later.

# ---- runtime environment ---------------------------------------------------
DEBUG=0
PATH="${PATH}:/usr/local/bin"
SELF=$(basename "$0")
ERROR_STRING='default error string'

# ---- filesystem layout (all paths hang off BASEDIR) ------------------------
BASEDIR=/home/stockh/stockharmony.com/api/scripts/
SYMBOLS_FILE="${BASEDIR}../data/nyse_index_symbols.txt"
SECS_DIR="${BASEDIR}../data/securities/"
LOGFILE="${BASEDIR}../logs/scraping"
PIDFILE="${BASEDIR}../logs/${SELF}.pid"
TMP_FILE=/tmp/yahoo-data

# Timestamp of this run; it also names the combined data file.
BEGIN_TIME=$(date +%Y%m%d%H%M%S)
DATA_FILE="${BASEDIR}../data/yahoo-finance-${BEGIN_TIME}.csv"

# ---- symbol list -----------------------------------------------------------
# Final entry of the symbol list.
LAST_SYMBOL=$(tail -n 1 "$SYMBOLS_FILE")
# The space-separated symbols accumulated for the next request to yahoo.
SYMBOLS=''
COUNT=0
GET_YAHOO=0

# ---- request parameters ----------------------------------------------------
YAHOO_URL='http://finance.yahoo.com/d/quotes.csv?s='
# Field-selector string with every newline removed; "6t" (the URL) was taken
# out of it.
ARG_STRING=$(tr -d '\n' < "${BASEDIR}../data/yahoo_arg_string_custom.txt")
# How many symbols should we query in one request?
GET_SYMBOLS=15
SLEEP_TIME=60

# define functions first, put in include file later

# Mail a one-line alert to the SMS address below, at most once per run.
#
# Arguments: $1 - message text (optional).  When omitted, whatever is
#                 currently stored in the global STRING is sent.
# Globals:   STRING   (overwritten with $1 when given, then sent)
#            SMS_SENT (non-empty once an alert has been attempted)
# Returns:   0
send_sms_msg () {
  if [ -n "$1" ]; then STRING=$1; fi

  # An alert already went out during this run; stay quiet.
  if [ -n "$SMS_SENT" ]; then
    return 0
  fi

  # The recipient must be a bare address: HTML anchor markup around it is
  # parsed by the shell as redirections and makes the script unparseable.
  printf '%s\n' "$STRING" | mailx -s 'stockh error' 1234567789@cingularme.com
  SMS_SENT=1
}

# Append a timestamp and an "OOPS" entry to LOGFILE.
#
# The return code written is the exit status of the command that ran
# immediately before log_error was called.
#
# Arguments: $1 - reason text (optional)
# Globals:   LOGFILE      (appended to)
#            STRING       (set to the logged message and left in place)
#            LOG_ERROR_RC (scratch: the captured exit status)
log_error () {
  # Must be the very first command: the test below would overwrite $?.
  LOG_ERROR_RC=$?

  if [ -n "$1" ]; then
    STRING="OOPS: return code $LOG_ERROR_RC because $1"
  else
    STRING="OOPS: return code $LOG_ERROR_RC"
  fi

  {
    date
    # printf with a quoted argument keeps the message literal (no globbing,
    # no whitespace collapsing).
    printf '%s\n' "$STRING"
  } >> "$LOGFILE"
}

# Append a timestamp and an "OK" entry to LOGFILE.
#
# Arguments: $1 - message text (optional).  Without it, a generic entry
#                 carrying the exit status of the command that ran just
#                 before the call is written.
# Globals:   LOGFILE       (appended to)
#            STRING        (set to the logged message and left in place)
#            LOG_NORMAL_RC (scratch: the captured exit status)
log_normal () {
  # Must be the very first command: the test below would overwrite $?.
  LOG_NORMAL_RC=$?

  if [ -n "$1" ]; then
    STRING="OK: $1"
  else
    STRING="OK: seems ok $LOG_NORMAL_RC"
  fi

  {
    date
    # Quoted printf: a message such as "got * symbols" must be logged
    # verbatim, not expanded against the current directory.
    printf '%s\n' "$STRING"
  } >> "$LOGFILE"
}

# Persist the rows currently held in TMP_FILE.
#
# The whole of TMP_FILE is appended to DATA_FILE.  Then, for every symbol in
# SYMBOLS, each row containing ",SYMBOL," (case-insensitive) is appended to
# ${SECS_DIR}SYMBOL.csv with the fetch timestamp GOT_WHEN as a new first
# column.
#
# Globals: TMP_FILE, SYMBOLS, GOT_WHEN, SECS_DIR (read);
#          DATA_FILE, LOGFILE (appended to); j (loop variable, overwritten)
save_symbol_data () {
  # save once in main file
  cat "$TMP_FILE" >> "$DATA_FILE" 2>> "$LOGFILE"

  # grep for each symbol and save in separate files
  # SYMBOLS is deliberately unquoted: it is a space-separated list.
  for j in $SYMBOLS; do
    # -F: the symbol is a literal string, so the dot in a symbol like
    #     "BRK.B" must not act as a regex wildcard.
    # The timestamp is emitted per matching row; when nothing matches,
    # nothing is written, so no unterminated "timestamp," fragment is left
    # to fuse with the next row.
    grep -i -F -e ",$j," "$TMP_FILE" 2>> "$LOGFILE" |
      while IFS= read -r row; do
        printf '%s,%s\n' "$GOT_WHEN" "$row"
      done >> "${SECS_DIR}$j.csv" 2>> "$LOGFILE"
  done
}

# Look if any symbols have changed in TMP_FILE, get that data too, and
# append it to TMP_FILE.
#
# A changed symbol is announced by a line containing
#   "Ticker symbol has changed to: <a href="/q?s=NEWSYM">...
# Every NEWSYM found is appended to SYMBOLS and fetched in one extra request.
#
# Globals: TMP_FILE (read, then appended to); LOGFILE (appended to);
#          YAHOO_URL, ARG_STRING, DEBUG (read); SYMBOLS (extended);
#          NEW, TMP, URL (scratch, overwritten)
# Returns: 0 when there is nothing to do; otherwise the status of the last
#          logging/alert call.
fetch_changed_symbol_data () {
  # Extract one replacement symbol per notice line.  '|' is the s-command
  # delimiter because the pattern itself contains '/'.  The lines are then
  # joined with single spaces and the trailing separator removed.
  NEW=$(sed -n '/"Ticker symbol has changed to:/s|.*changed to: <a href="/q?s=\([^"]*\)">.*|\1|p' \
    "$TMP_FILE" 2>> "$LOGFILE" | tr '\n' ' ')
  NEW=${NEW% }

  if [ -z "$NEW" ]; then
    return 0
  fi

  log_normal "got changed symbols: $NEW"
  # save new syms to global var $SYMBOLS
  SYMBOLS="${SYMBOLS} ${NEW}"
  TMP=$(printf '%s' "$NEW" | tr ' ' '+')
  URL="${YAHOO_URL}${TMP}&f=$ARG_STRING"

  # append data to tmp file
  if lynx -dump "$URL" >> "$TMP_FILE" 2>> "$LOGFILE"; then
    # DEBUG counts as "on" only when it is non-empty and not 0.
    if [ -n "$DEBUG" ] && [ "$DEBUG" != 0 ]; then log_normal "$URL"; fi
  else
    # log_error is the first command of this branch so that $? still holds
    # lynx's exit status when it is entered.
    log_error "lynx failed getting changed symbols $URL"
    send_sms_msg "lynx failed getting $NEW"
  fi
}

# Take the space-separated list of symbols in SYMBOLS, query yahoo for them
# and save the reply to TMP_FILE (overwriting it).  On success, record the
# fetch time in GOT_WHEN, pick up any changed symbols and store the data.
#
# Globals: SYMBOLS, YAHOO_URL, ARG_STRING, DEBUG (read);
#          TMP_FILE (overwritten); LOGFILE (appended to);
#          GOT_WHEN (set to YYYYmmddHHMMSS on success);
#          TMP, URL (scratch, overwritten)
fetch_and_save_symbol_data () {
  # replace space with + for URL
  # $SYMBOLS is intentionally unquoted so that any run of whitespace
  # becomes a single separator.
  TMP=$(echo $SYMBOLS | tr ' ' '+')
  URL="${YAHOO_URL}${TMP}&f=$ARG_STRING"

  # clobber TMP_FILE with new data
  # The URL is quoted: it contains '?', which is a glob character.
  if lynx -dump "$URL" > "$TMP_FILE" 2>> "$LOGFILE"; then
    # DEBUG counts as "on" only when it is non-empty and not 0.
    if [ -n "$DEBUG" ] && [ "$DEBUG" != 0 ]; then log_normal "$URL"; fi
    GOT_WHEN=$(date +%Y%m%d%H%M%S)
    fetch_changed_symbol_data
    # regardless of fetch_changed_symbol_data always save symbol data at
    # this point
    save_symbol_data
  else
    # log_error is the first command of this branch so that $? still holds
    # lynx's exit status when it is entered.
    log_error "lynx failed getting $URL"
    send_sms_msg "lynx failed on $TMP"
  fi
}

# ---------------------------------------------------------------------------
# Main: take the PID-file lock, walk the symbol list in batches of
# GET_SYMBOLS, then log a run summary and release the lock.
# ---------------------------------------------------------------------------

# An existing PID file means another run is active (or died without
# cleaning up); refuse to start.
if [ -f "$PIDFILE" ]; then
  send_sms_msg "$SELF exiting, PID exists"
  log_error "$SELF exiting, PID exists"
  exit 1
fi

# Without a readable symbol list there is nothing to fetch; report it
# rather than logging a "finished" run that did no work.
if [ ! -r "$SYMBOLS_FILE" ]; then
  log_error "$SELF exiting, cannot read $SYMBOLS_FILE"
  send_sms_msg "$SELF exiting, cannot read symbols file"
  exit 1
fi

echo $$ > "$PIDFILE" 2>> "$LOGFILE"

# Do not leave a stale PID file behind when interrupted; it would block
# every later run.
trap 'rm -f "$PIDFILE"; exit 1' HUP INT TERM

# save $GET_SYMBOLS amount of symbols in $SYMBOLS then call functions
# (the command substitution is deliberately unquoted: one word per symbol)
for i in $(cat "$SYMBOLS_FILE"); do
  COUNT=$((COUNT + 1))
  SYMBOLS="${SYMBOLS:+$SYMBOLS }$i"

  if [ "$COUNT" -ge "$GET_SYMBOLS" ]; then
    # DEBUG counts as "on" only when it is non-empty and not 0.
    if [ -n "$DEBUG" ] && [ "$DEBUG" != 0 ]; then
      log_normal "SYMBOLS are $SYMBOLS"
    fi
    fetch_and_save_symbol_data
    # pause between consecutive requests
    sleep "$SLEEP_TIME"
    SYMBOLS=''
    COUNT=0
  fi
done

# Fetch whatever is left over after the last full batch.  Doing this after
# the loop (instead of comparing each symbol with the file's last line)
# also works when the list ends in a blank line or repeats a symbol.  No
# sleep is needed: no further request follows.
if [ -n "$SYMBOLS" ]; then
  if [ -n "$DEBUG" ] && [ "$DEBUG" != 0 ]; then
    log_normal "SYMBOLS are $SYMBOLS"
  fi
  fetch_and_save_symbol_data
  SYMBOLS=''
  COUNT=0
fi

# run summary
echo "$SELF started at $BEGIN_TIME" >> "$LOGFILE" 2>&1
echo "$SELF finished on $(date)" >> "$LOGFILE" 2>&1
wc -l "$DATA_FILE" >> "$LOGFILE" 2>&1
wc -l "$SYMBOLS_FILE" >> "$LOGFILE" 2>&1

# release the lock; report rm's own exit status if that fails
if rm -f "$PIDFILE" >> "$LOGFILE" 2>&1; then
  exit 0
else
  RM_RC=$?
  send_sms_msg "could not remove PIDFILE. rm returned $RM_RC"
  exit 1
fi

Source: http://www.snippetsmania.com/scrape-yahoo-finance/

No comments:

Post a Comment