您现在的位置: 网站首页 / 开发之路 / 正文

印钞机の训练数据导出

作者: Azure 发布: 2018-6-14 分类: 开发之路 阅读: 次 查看评论

我尝试用TensorFlow实现一个卷积神经网络来看懂K线走势。一直想验证技术分析是否有效。那么说干就干,从0开始实现一个最强的撸K高手。 


首先遇到的问题就是收集训练数据。还好网上有免费的K线数据,可以通过Tushare获取。

我将获取2005年1月1日开始到2018年6月的所有历史沪深300成份股的K线数据(大概758只)作为训练数据。

获取连续32日的OCHLV(开盘、收盘、最高、最低、成交量)数据,然后分别求出32日内价格的最大值与最小值(最高价的最大值、最低价的最小值)以及成交量的最大值与最小值,将32日的价格和成交量各自归一化到0到1的区间中去。

我用32日窗口之后第二个交易日的开盘价比上窗口之后第一个交易日的开盘价,如果涨幅超过0.14%(平衡一点交易成本)则标签为[1,0],否则标签为[0,1]


于是定义DataFetcher.py将数据自动化抓取到DataDownload目录下去,每只股票一个CSV文件存好:

import tushare as ts
from datetime import date
import pandas as pd
import os
# Start of the K-line date range used when the user leaves the start-date
# prompt empty.
defaultStartDate = '2005-01-01'
# End of the date range used when the end-date prompt is left empty:
# today's date rendered as a string.
defaultEndDate = str(date.today())
# Forwarded as the `index` argument of ts.get_k_data for every code.
# NOTE(review): presumably False means the codes are individual stocks rather
# than market indices -- confirm against the Tushare documentation.
isIndex = False
INDEX_LIST = [u'600259', u'002275', u'000829', u'600909', u'002570', u'000413', u'002572', u'000627', u'000156', u'000625', u'000099', u'000623', u'002378', u'000096', u'000629', u'600893', u'600724', u'600895', u'600726', u'000068', u'601229', u'601228', u'600780', u'601717', u'601633', u'002052', u'002051', u'002050', u'601555', u'300251', u'002110', u'601360', u'600153', u'000022', u'600151', u'000024', u'600157', u'000027', u'000029', u'600091', u'601901', u'600096', u'300002', u'600795', u'000157', u'600790', u'601318', u'600655', u'600654', u'600657', u'600653', u'600652', u'000532', u'000533', u'000536', u'600073', u'600072', u'600372', u'600373', u'000927', u'000539', u'000921', u'600377', u'603858', u'002027', u'002024', u'002025', u'600011', u'002028', u'002594', u'002375', u'600383', u'600380', u'002456', u'002450', u'600570', u'600048', u'600010', u'600578', u'600350', u'600472', u'600207', u'600205', u'600200', u'600208', u'000997', u'000999', u'600297', u'601009', u'002624', u'601225', u'601003', u'601002', u'601001', u'600894', u'601006', u'601005', u'600271', u'600270', u'600277', u'600276', u'300072', u'300070', u'002230', u'601186', u'002236', u'600854', u'600851', u'000666', u'000667', u'600859', u'601179', u'000612', u'300104', u'000690', u'000959', u'600550', u'603799', u'600003', u'601699', u'600581', u'600583', u'600582', u'000060', u'000061', u'000063', u'600588', u'002244', u'002242', u'000069', u'002241', u'600611', u'601828', u'600616', u'000100', u'002146', u'601212', u'601211', u'600754', u'600757', u'600038', u'601558', u'600031', u'600030', u'600033', u'600035', u'600037', u'600036', u'600839', u'002411', u'002410', u'002416', u'002415', u'002142', u'002065', u'601288', u'600535', u'601958', u'002097', u'600109', u'002092', u'600104', u'002797', u'600100', u'600103', u'600102', u'600325', u'000400', u'000401', u'000406', u'600320', u'000817', u'600246', u'600977', u'600970', u'000568', u'600978', u'000562', u'002466', u'002465', 
u'002460', u'601163', u'600118', u'000541', u'002468', u'000961', u'000960', u'000963', u'600403', u'600406', u'600408', u'000968', u'000540', u'000709', u'000708', u'600812', u'600811', u'600816', u'000707', u'000498', u'601139', u'000938', u'002841', u'000792', u'000793', u'300027', u'002081', u'300146', u'300144', u'600108', u'002202', u'002568', u'000021', u'601988', u'601989', u'600150', u'002085', u'000636', u'000631', u'000630', u'601390', u'601398', u'600718', u'600868', u'600866', u'600867', u'600717', u'600710', u'600863', u'601628', u'601328', u'600158', u'601766', u'000402', u'603699', u'601939', u'002106', u'601919', u'601918', u'600143', u'000039', u'601857', u'000598', u'002292', u'002294', u'000031', u'600783', u'600782', u'600376', u'601369', u'600787', u'600786', u'002194', u'002195', u'600660', u'600662', u'600663', u'600664', u'600666', u'600369', u'000520', u'000527', u'600361', u'000528', u'000912', u'600362', u'601838', u'000916', u'000917', u'000088', u'002038', u'600446', u'002429', u'603993', u'300433', u'002422', u'002426', u'002424', u'601566', u'601258', u'600569', u'600161', u'600971', u'600215', u'600216', u'600210', u'600074', u'600219', u'601788', u'000596', u'000983', u'000599', u'002736', u'600926', u'300182', u'002925', u'002739', u'000858', u'600282', u'000538', u'601198', u'600079', u'600886', u'000920', u'600078', u'600823', u'600820', u'600638', u'600827', u'000776', u'000778', u'601299', u'600737', u'601669', u'601101', u'601106', u'601107', u'601108', u'300085', u'002498', u'001979', u'002555', u'002493', u'601238', u'002558', u'601727', u'601216', u'601818', u'002399', u'002252', u'600747', u'600744', u'600628', u'600741', u'600740', u'600627', u'600621', u'600098', u'600748', u'600022', u'000962', u'601618', u'601611', u'600026', u'600027', u'000969', u'002174', u'002073', u'300017', u'002074', u'600489', u'600521', u'000002', u'601966', u'000001', u'600481', u'600482', u'600528', u'600485', u'000008', u'000009', 
u'600138', u'600703', u'601336', u'601333', u'002400', u'600132', u'002069', u'000828', u'000410', u'600312', u'000415', u'600315', u'600316', u'600317', u'600251', u'600707', u'000822', u'600252', u'000825', u'000826', u'600256', u'002673', u'000671', u'600961', u'600873', u'000550', u'000555', u'002475', u'002470', u'600004', u'600005', u'600006', u'600007', u'600000', u'600001', u'600002', u'600418', u'600415', u'600008', u'000951', u'600410', u'000735', u'000737', u'000733', u'000738', u'600009', u'601668', u'001965', u'000301', u'601021', u'002602', u'002603', u'002601', u'002608', u'600919', u'000780', u'000783', u'000956', u'300015', u'000786', u'601991', u'601992', u'601997', u'601998', u'601666', u'000601', u'300408', u'000607', u'002352', u'002353', u'000703', u'601866', u'600879', u'600508', u'600875', u'600874', u'600871', u'600705', u'600704', u'601158', u'601155', u'601098', u'601099', u'600688', u'002131', u'601877', u'600682', u'600685', u'002508', u'600117', u'600123', u'601928', u'002500', u'600675', u'600674', u'600176', u'600177', u'000046', u'600171', u'002183', u'600779', u'600600', u'601375', u'600770', u'600196', u'000518', u'600190', u'600058', u'600057', u'600061', u'000511', u'600198', u'600050', u'002001', u'002007', u'002008', u'000543', u'600456', u'300024', u'002431', u'601929', u'600518', u'600519', u'600236', u'600516', u'600517', u'600220', u'600221', u'600228', u'601688', u'000581', u'600231', u'002625', u'600958', u'600959', u'000698', u'300058', u'300059', u'600426', u'600357', u'600299', u'000869', u'600352', u'000900', u'000866', u'600296', u'600428', u'000059', u'601898', u'601899', u'600170', u'000939', u'002310', u'000886', u'600832', u'600835', u'000883', u'600837', u'000768', u'000767', u'600838', u'000763', u'000761', u'601111', u'000559', u'600309', u'601117', u'601118', u'601985', u'300122', u'300124', u'000425', u'000932', u'002269', u'601801', u'601800', u'000422', u'002385', u'000089', u'601808', u'600884', 
u'600639', u'600887', u'600880', u'600881', u'601233', u'600633', u'600631', u'600630', u'600637', u'600635', u'000166', u'601377', u'002299', u'601607', u'601601', u'601600', u'002044', u'002049', u'600497', u'600498', u'600126', u'600125', u'000016', u'600088', u'600089', u'600121', u'000012', u'600085', u'600087', u'601878', u'600436', u'600648', u'600649', u'600642', u'600643', u'600641', u'600066', u'600062', u'000429', u'600060', u'600308', u'600307', u'000937', u'600432', u'000420', u'000933', u'600068', u'000423', u'600998', u'600999', u'600997', u'000930', u'300315', u'600438', u'600398', u'600012', u'000949', u'600546', u'600017', u'600016', u'600015', u'603833', u'600390', u'600019', u'600018', u'600549', u'600548', u'000723', u'000727', u'000726', u'000725', u'000680', u'000682', u'000728', u'000685', u'000686', u'000338', u'000333', u'601018', u'600900', u'600399', u'601016', u'300168', u'600547', u'601012', u'600266', u'600267', u'000831', u'300003', u'600269', u'000839', u'002344', u'601718', u'603160', u'603288', u'000750', u'000751', u'000617', u'600460', u'000758', u'600849', u'002078', u'000718', u'600395', u'601168', u'601169', u'002839', u'600300', u'601088', u'002831', u'601166', u'002122', u'601231', u'600690', u'002129', u'002128', u'600694', u'601933', u'000712', u'600591', u'600596', u'600597', u'600595', u'600598', u'600602', u'600169', u'600601', u'600606', u'600608', u'600160', u'600166', u'601872', u'601268', u'600761', u'000729', u'600739', u'600028', u'600029', u'600183', u'600522', u'000503', u'600023', u'600020', u'600021', u'601519', u'600188', u'600025', u'002405', u'002155', u'002152', u'002153', u'600797', u'603260', u'601588', u'601969', u'600501', u'600500', u'600115', u'600110', u'600111', u'600332', u'600333', u'600239', u'600331', u'000806', u'000807', u'600233', u'000800', u'600339', u'601608', u'600487', u'000572', u'000573', u'000571', u'002714', u'600834', u'002653', u'603885', u'601881', u'000970', u'600340', 
u'000977', u'000975', u'600346', u'600348', u'000876', u'000875', u'000652', u'000651', u'601888', u'000656', u'002083', u'000659', u'002304', u'000895', u'603000', u'000897', u'600808', u'600809', u'300136', u'600804', u'600805', u'000717', u'000488', u'000898', u'601127', u'000066', u'000618', u'600135', u'000823', u'300033', u'000755', u'000036', u'000507', u'600585', u'300133', u'000878']

# Ask for the date range. An empty or whitespace-only answer selects the
# default shown in the prompt, so the values are usable as soon as they are read.
startDate = input("Input Start Date (%s) : " %defaultStartDate).strip()
if startDate == '':
    startDate = defaultStartDate
endDate = input("Input End Date (%s) : " %defaultEndDate).strip()
if endDate == '':
    endDate = defaultEndDate

# Position in INDEX_LIST to resume from; entries before it are skipped by the
# export loop. Empty input means "start at 0". Anything that is not a whole
# number is asked for again instead of aborting the script with a traceback.
startIdx = None
while startIdx is None:
    startIdx1 = input("Input Start Num (0): ").strip()
    if startIdx1 == '':
        startIdx = 0
        continue
    try:
        startIdx = int(startIdx1)
    except ValueError:
        print('"%s" is not a whole number, please try again.' % startIdx1)

# Running totals across all stocks, reported in the export progress messages.
lines = 0       # samples generated so far
upcount = 0     # samples labelled UP
downcount = 0   # samples labelled DOWN
# Position of the stock currently being processed within INDEX_LIST.
currIdx = 0
# Window geometry: TRAIN_DAYS consecutive rows form the features of one sample.
# The label compares the open PERDICT_DAYS rows after the first row that
# follows the window with the open of that first row.
TRAIN_DAYS = 32
PERDICT_DAYS = 1

# Source columns, in the order their scaled values are written for each day.
_FEATURE_COLUMNS = ['open', 'close', 'high', 'low', 'volume']

# CSV header: O<i>, C<i>, H<i>, L<i>, V<i> for every day of the window,
# followed by the two label columns.
_TABLE_COLUMNS = []
for _day in range(TRAIN_DAYS):
    _TABLE_COLUMNS.extend(
        ['O%d' % _day, 'C%d' % _day, 'H%d' % _day, 'L%d' % _day, 'V%d' % _day])
_TABLE_COLUMNS.extend(['UP', 'DOWN'])

# Directory part of every output file name built in the loop below.
_OUTPUT_DIR = 'DataDownload'


def _scale_window(window):
    """Min-max scale one window of K-line rows.

    The four price columns share one range (lowest low .. highest high);
    volume is scaled against its own minimum and maximum.

    Args:
        window: 2-D float array whose columns follow _FEATURE_COLUMNS
            (open, close, high, low, volume), one row per day.

    Returns:
        A flat list of the scaled values in row-major order
        (O, C, H, L, V of the first day, then of the second day, ...),
        or None when the price range or the volume range is zero and
        scaling would divide by zero.
    """
    price_low = window[:, 3].min()
    price_high = window[:, 2].max()
    volume_low = window[:, 4].min()
    volume_high = window[:, 4].max()

    price_span = price_high - price_low
    volume_span = volume_high - volume_low
    if price_span == 0 or volume_span == 0:
        return None

    scaled = window.copy()  # never write into the caller's data
    scaled[:, :4] = (scaled[:, :4] - price_low) / price_span
    scaled[:, 4] = (scaled[:, 4] - volume_low) / volume_span
    return scaled.ravel().tolist()


# Resolve the date defaults once instead of once per stock.
if startDate == "":
    startDate = defaultStartDate
if endDate == "":
    endDate = defaultEndDate

# Without the output directory every to_csv call would fail, so create it
# before any data is downloaded.
if not os.path.isdir(_OUTPUT_DIR):
    os.makedirs(_OUTPUT_DIR)

for code in INDEX_LIST:

    print('Curr Processing Index : %d' %currIdx)

    position = currIdx
    currIdx += 1
    if position < startIdx:
        # Resuming: everything before the requested start position is skipped.
        continue

    outputFileName = 'DataDownload/Data_%s_(%s)_(%s).csv' %(code, startDate, endDate)
    print('Data Download to : %s' %outputFileName)

    if os.path.exists(outputFileName):
        print('%s already exist! skip it! ...' %outputFileName)
        continue

    try:
        df = ts.get_k_data(code, index=isIndex, start=str(startDate), end=str(endDate))
        if df is None or len(df) == 0:
            print('No data returned for %s, skip it! ...' % code)
            continue

        # Keep only the five numeric columns, as one float array, so windows
        # can be sliced by position.
        kdata = df[_FEATURE_COLUMNS].astype(float).values
        rows = len(kdata)

        print('Data has %d Rows ... ' %rows)

        #data process
        # Number of windows that still have both label rows after them.
        RANGE_END = rows - TRAIN_DAYS - PERDICT_DAYS

        if RANGE_END <= 0:
            # Too short for even one sample; do not leave a header-only CSV
            # behind, because later runs would treat it as already done.
            print('Not enough rows for %s, skip it! ...' % code)
            continue

        opens = kdata[:, 0]
        samples = []
        skipped = 0

        for i in range(0, RANGE_END):

            check_next = float(opens[i + TRAIN_DAYS + PERDICT_DAYS])
            check_prev = float(opens[i + TRAIN_DAYS])

            feeddata = _scale_window(kdata[i:i + TRAIN_DAYS])
            if feeddata is None or check_prev == 0:
                # A flat window or a zero reference price cannot be scaled or
                # labelled; drop this one sample instead of the whole stock.
                skipped += 1
                continue

            growth = check_next / check_prev - 1

            # Label UP only when the gain exceeds 0.14 percent.
            if growth*100.0>0.14:
                up = 1.0
                down = 0.0
                upcount += 1
            else:
                up = 0.0
                down = 1.0
                downcount += 1

            feeddata.append(up)
            feeddata.append(down)

            # Collect rows in a list; the DataFrame is built once afterwards
            # instead of being grown one row at a time.
            samples.append(feeddata)

            lines += 1

            if i%100==0:
                percent = i / float(RANGE_END) * 100.0
                print("Exporting %0.2f%% Lines:%d Up:%d Down:%d..." %(percent, lines, upcount, downcount))

        if skipped:
            print('Skipped %d degenerate windows for %s' % (skipped, code))

        print('(%d/%d) Saving To %s ...' %(currIdx, len(INDEX_LIST), outputFileName))

        table = pd.DataFrame(samples, columns=_TABLE_COLUMNS)
        table = table.dropna(axis=0, how='any')
        table = table.drop_duplicates()

        table.to_csv(path_or_buf=outputFileName, index=False, header=True)

        print('Data Saved ...')

    except Exception as exc:
        # Best effort: one failing stock must not stop the batch, but the
        # failure is reported. KeyboardInterrupt is not an Exception subclass,
        # so Ctrl-C still stops the run.
        print('Failed to process %s: %r' % (code, exc))
        continue



下一篇 » 原创文章,转载请注明出处!标签: TensorFlow  

控制面板
网站分类
搜索
最新留言
友情链接