finlab.ml

finlab.ml.feature

combine

combine(features, resample=None, sample_filter=None, **kwargs)

The combine function takes a dictionary of features as input and combines them into a single pandas DataFrame. combine 函數接受一個特徵字典作為輸入，並將它們合併成一個 pandas DataFrame。

PARAMETER	DESCRIPTION
`features`	a dictionary of features where index is datetime and column is instrument. 一個特徵字典，其中索引為日期時間，欄位為證券代碼。 TYPE: `Dict[str, DataFrame]`
`resample`	Optional argument to resample the data in the features. Default is None. 選擇性的參數，用於重新取樣特徵中的資料。預設為 None。 TYPE: `str` DEFAULT: `None`
`sample_filter`	a boolean DataFrame where index is date and columns are instruments; only the rows of the combined features where the filter is true are kept. TYPE: `DataFrame` DEFAULT: `None`
`**kwargs`	Additional keyword arguments to pass to the resampler function. 傳遞給重新取樣函數 resampler 的其他關鍵字引數。 DEFAULT: `{}`

RETURNS	DESCRIPTION
	A pandas DataFrame containing all the input features combined. 一個包含所有輸入特徵合併後的 pandas DataFrame。

Examples:

這段程式碼教我們如何使用finlab.ml.feature和finlab.data模組，來合併兩個特徵：RSI和股價淨值比。我們使用f.combine函數來進行合併，其中特徵的名稱是字典的鍵，對應的資料是值。我們從data.indicator('RSI')取得'rsi'特徵，這個函數計算相對強弱指數。我們從data.get('price_earning_ratio:股價淨值比')取得'pb'特徵，這個函數獲取股價淨值比。最後，我們得到一個包含這兩個特徵的DataFrame。

from finlab import data
import finlab.ml.feature as f
import finlab.ml.qlib as q

features = f.combine({

    # 用 data.get 簡單產生出技術指標
    'pb': data.get('price_earning_ratio:股價淨值比'),

    # 用 data.indicator 產生技術指標的特徵
    'rsi': data.indicator('RSI'),

    # 用 f.ta 枚舉超多種 talib 指標
    'talib': f.ta(f.ta_names()),

    # 利用 qlib Alpha158 產生技術指標的特徵(請先執行 q.init(), q.dump() 才能使用)
    'qlib158': q.alpha('Alpha158')

    })

features.head()

datetime	instrument	rsi	pb
2020-01-01	1101	0	2
2020-01-02	1102	100	3
2020-01-03	1108	100	4

Source code in finlab/ml/feature.py

def combine(features:Dict[str, pd.DataFrame], resample=None, sample_filter=None, **kwargs):

    """Combine a dictionary of features into one long-format pandas DataFrame.

    Wide features (datetime index, instrument columns) are resampled, reindexed
    to the union of all dates and the intersection of all instruments,
    forward-filled and stacked, yielding one column per feature named after its
    dictionary key. Features whose index is already a ``pd.MultiIndex`` are
    concatenated column-wise as they are; when wide features are also present,
    they are first restricted to the dates of those wide features.

    The input DataFrames are never modified.

    Args:
        features (Dict[str, pd.DataFrame]): a dictionary of features where index is
            datetime and column is instrument.
        resample (str): Optional argument to resample the data in the features.
            Default is None.
        sample_filter (pd.DataFrame): a boolean DataFrame where index is date and
            columns are instruments. It is resampled and forward-filled like the
            features, and only rows where it is true are kept; entries missing
            from the filter count as False.
        **kwargs: Additional keyword arguments to pass to the resampler function.

    Returns:
        A pandas DataFrame containing all the input features combined, indexed by
        ``(datetime, instrument)`` and sorted by index. An empty DataFrame is
        returned when ``features`` is empty.

    Examples:
        This example uses finlab.ml.feature and finlab.data to combine several
        features. The dictionary keys become the feature names and the values are
        the corresponding data: 'rsi' comes from data.indicator('RSI') (relative
        strength index) and 'pb' comes from
        data.get('price_earning_ratio:股價淨值比') (price-to-book ratio).

        ``` py
        from finlab import data
        import finlab.ml.feature as f
        import finlab.ml.qlib as q

        features = f.combine({

            # build a feature directly from data.get
            'pb': data.get('price_earning_ratio:股價淨值比'),

            # build a technical-indicator feature with data.indicator
            'rsi': data.indicator('RSI'),

            # enumerate many talib indicators with f.ta
            'talib': f.ta(f.ta_names()),

            # build qlib Alpha158 features (run q.init() and q.dump() first)
            'qlib158': q.alpha('Alpha158')

            })

        features.head()
        ```

        |    datetime   | instrument |     rsi    |     pb     |
        |---------------|------------|------------|------------|
        |   2020-01-01  |    1101    |     0      |     2      |
        |   2020-01-02  |    1102    |     100    |     3      |
        |   2020-01-03  |    1108    |     100    |     4      |

    """

    if not features:
        return pd.DataFrame()

    def resampling(df) -> pd.DataFrame:
        return resampler(df, resample, **kwargs)

    index_names = ['datetime', 'instrument']

    union_index = None      # union of the dates of all wide features
    common_columns = None   # intersection of the instruments of all wide features
    unstacked = {}          # wide features, replaced by their stacked values below
    concats = []            # frames that are already in long (MultiIndex) format

    for name, df in features.items():

        if isinstance(df.index, pd.MultiIndex):
            concats.append(df)
            continue

        if isinstance(df, FinlabDataFrame):
            df = df.index_str_to_date()

        udf = resampling(df)
        unstacked[name] = udf
        union_index = udf.index if union_index is None else union_index.union(udf.index)
        if common_columns is None:
            common_columns = udf.columns
        else:
            common_columns = common_columns.intersection(udf.columns)

    final_index = None
    for name, udf in unstacked.items():
        # Every wide feature is aligned to the same grid, so the stacked values
        # of all features share the index captured from the first one.
        stacked = udf\
            .reindex(index=union_index, columns=common_columns)\
            .ffill()\
            .T\
            .unstack()
        unstacked[name] = stacked.values

        if final_index is None:
            final_index = stacked.index

    for i, c in enumerate(concats):
        # rename_axis returns a new object, so the caller's DataFrame keeps its
        # own index names instead of being renamed in place.
        c = c.rename_axis(index=index_names)
        if union_index is not None:
            c = c[c.index.get_level_values('datetime').isin(union_index)]
        concats[i] = c

    if unstacked:
        unstack_df = pd.DataFrame(unstacked, index=final_index)
        unstack_df.index = unstack_df.index.set_names(index_names)
        concats.append(unstack_df)

    ret = pd.concat(concats, axis=1)
    ret.sort_index(inplace=True)

    if sample_filter is not None:
        if isinstance(sample_filter, FinlabDataFrame):
            sample_filter = sample_filter.index_str_to_date()
        usf = resampling(sample_filter)

        if union_index is not None and common_columns is not None:
            usf = usf.reindex(index=union_index, columns=common_columns)

        # Stack the filter the same way as the features and align it to the
        # combined rows; rows with no filter entry are dropped.
        usf = usf.ffill()\
           .T\
           .unstack()\
           .reindex(ret.index).fillna(False)
        ret = ret[usf.values]

    return ret

ta

ta(feature_names, factories=None, resample=None, start_time=None, end_time=None, adj=False, cpu=-1, **kwargs)

Calculate technical indicator values for a list of feature names.

PARAMETER	DESCRIPTION
`feature_names`	A list of technical indicator feature names. Defaults to None. TYPE: `Optional[List[str]]`
`factories`	A dictionary of factories to generate technical indicators. Defaults to {"talib": TalibIndicatorFactory()}. TYPE: `Optional[Dict[str, TalibIndicatorFactory]]` DEFAULT: `None`
`resample`	The frequency to resample the data to. Defaults to None. TYPE: `Optional[str]` DEFAULT: `None`
`start_time`	The start time of the data. Defaults to None. TYPE: `Optional[str]` DEFAULT: `None`
`end_time`	The end time of the data. Defaults to None. TYPE: `Optional[str]` DEFAULT: `None`
`**kwargs`	Additional keyword arguments to pass to the resampler function. DEFAULT: `{}`

RETURNS	DESCRIPTION
`DataFrame`	pd.DataFrame: technical indicator feature names and their corresponding values.

Source code in finlab/ml/feature.py

def ta(feature_names:Optional[List[str]], 
       factories=None,
       resample=None, 
       start_time=None, 
       end_time=None, 
       adj=False,
       cpu=-1,
       **kwargs) -> pd.DataFrame:
    """Calculate technical indicator values for a list of feature names.

    Each feature is computed, resampled and stacked into one column of the
    returned long-format DataFrame. Features whose computation yields ``None``
    are left out of the result.

    Args:
        feature_names (Optional[List[str]]): A list of technical indicator feature names.
            The argument is required; pass None to use the names from ``ta_names()``.
        factories (Optional[Dict[str, TalibIndicatorFactory]]): A dictionary of factories to generate technical indicators. Defaults to {"talib": TalibIndicatorFactory()}.
        resample (Optional[str]): The frequency to resample the data to. Defaults to None.
        start_time (Optional[str]): The start time of the data. Defaults to None.
        end_time (Optional[str]): The end time of the data. Defaults to None.
        adj (bool): Forwarded to the indicator calculation and to the shared-memory
            market wrapper. Defaults to False.
        cpu (int): Number of worker processes. ``-1`` (default) uses
            ``multiprocessing.cpu_count()``; ``1`` takes the single-process branch.
        **kwargs: Additional keyword arguments to pass to the resampler function.

    Returns:
        pd.DataFrame: technical indicator feature names and their corresponding values,
        indexed by ``(datetime, instrument)`` with columns in the order of ``feature_names``.
    """

    if factories is None:
        factories = {'talib':TalibIndicatorFactory()}

    if feature_names is None:
        feature_names = ta_names()

    if cpu == -1:
        import multiprocessing
        cpu = multiprocessing.cpu_count()

    if cpu == 1:
        market = ml.get_market()
    else:
        # Workers receive the market through market.to_args() (see the pool call below).
        market = finlab.market_info.MarketInfoSharedMemory(ml.get_market(), adj=adj, start_time=start_time, end_time=end_time)

    # Reference computation: its stacked index and length define the index of the
    # result and the fixed row length np.fromiter expects from every feature.
    test_f = resampler(TalibIndicatorFactory().calculate_indicator("RSI", 'real', {}, adj=adj, market=market), 
                       resample, **kwargs).T.unstack()

    # Names of the features actually produced, in the order they were yielded.
    final_columns = []

    def create_features() -> Generator[np.ndarray, None, None]:

        nonlocal final_columns

        if cpu == 1:
            for name in feature_names:

                # parallel processing wrapper function
                # name, values = create_feature((name, factories, resample, end_time, adj, kwargs) + tuple([market.to_args()]))

                # single processing
                # NOTE(review): factory, func, output and params are not defined anywhere
                # in this function as shown; unless they are module-level names this
                # branch raises NameError. They presumably should be derived from
                # `name` and `factories` — confirm against create_feature.
                values = resampler(factory.calculate_indicator(func, output, params, adj=adj, market=market), resample, **kwargs).T.unstack()
                if values is not None:
                    final_columns.append(name)
                    yield values
        else:
            import multiprocessing
            with multiprocessing.Pool(processes=cpu) as pool:

                # Results arrive in completion order (imap_unordered), which is why
                # final_columns is tracked and the columns are re-ordered at the end.
                # NOTE(review): end_time is forwarded to create_feature but start_time
                # is not — confirm this is intended.
                for result in pool.imap_unordered(
                                            create_feature, 
                                            [(name, factories, resample, end_time, adj, kwargs) + tuple([market.to_args()])
                                            for name in feature_names]):
                    if result is not None:
                        name, values = result
                        final_columns.append(name)
                        yield values


    # One row per produced feature, each of length len(test_f).
    values = np.fromiter(
            create_features(), 
            dtype=np.dtype((np.float64, len(test_f))))

    # NOTE(review): close() is called on both paths and only when no exception was
    # raised above; confirm the object returned by ml.get_market() provides close().
    market.close()

    final_names = set(final_columns)
    ordered_names = [n for n in feature_names if n in final_names]

    ret = pd.DataFrame(values.T, index=test_f.index, 
                       columns=final_columns, copy=False)

    ret.index.names = ['datetime', 'instrument']
    # Restore the caller's requested feature order.
    return ret[ordered_names]

ta_names

ta_names(lb=1, ub=10, n=1, factory=None)

Generate a list of technical indicator feature names.

PARAMETER	DESCRIPTION
`lb`	The lower bound of the multiplier of the default parameter for the technical indicators. TYPE: `int` DEFAULT: `1`
`ub`	The upper bound of the multiplier of the default parameter for the technical indicators. TYPE: `int` DEFAULT: `10`
`n`	The number of random samples for each technical indicator. TYPE: `int` DEFAULT: `1`
`factory`	A factory object to generate technical indicators. Defaults to TalibIndicatorFactory. TYPE: `IndicatorFactory` DEFAULT: `None`

RETURNS	DESCRIPTION
`List[str]`	List[str]: A list of technical indicator feature names.

Examples:

import finlab.ml.feature as f


# method 1: generate each indicator with random parameters
features = f.ta(None)

# method 2: generate specific indicator
feature_names = ['talib.MACD__macdhist__fastperiod__52__slowperiod__212__signalperiod__75__']
features = f.ta(feature_names, resample='W')

# method 3: generate some indicator
feature_names = f.ta_names()
features = f.ta(feature_names)

Source code in finlab/ml/feature.py

def ta_names(lb:int=1, ub:int=10, n:int=1, factory=None) -> List[str]:
    """
    Generate a list of technical indicator feature names.

    The factory is asked for the feature names of each of its functions and the
    per-function lists are flattened, in order, into one list.

    Args:
        lb (int): The lower bound of the multiplier of the default parameter for the technical indicators.
        ub (int): The upper bound of the multiplier of the default parameter for the technical indicators.
        n (int): The number of random samples for each technical indicator.
        factory (IndicatorFactory): A factory object to generate technical indicators.
            It must provide ``all_functions()`` and
            ``generate_feature_names(function, lb, ub, n)``.
            Defaults to TalibIndicatorFactory.

    Returns:
        List[str]: A list of technical indicator feature names.

    Examples:
        ```py
        import finlab.ml.feature as f


        # method 1: generate each indicator with random parameters
        # (feature_names is a required argument of f.ta; None selects f.ta_names())
        features = f.ta(None)

        # method 2: generate specific indicator
        feature_names = ['talib.MACD__macdhist__fastperiod__52__slowperiod__212__signalperiod__75__']
        features = f.ta(feature_names, resample='W')

        # method 3: generate some indicator
        feature_names = f.ta_names()
        features = f.ta(feature_names)
        ```
    """

    if factory is None:
        factory = TalibIndicatorFactory()

    # Flatten in a single pass; sum(list_of_lists, []) copies the growing list on
    # every addition and is quadratic in the number of names.
    return [
        feature_name
        for func in factory.all_functions()
        for feature_name in factory.generate_feature_names(func, lb, ub, n)
    ]

finlab.ml.label

daytrading_percentage

daytrading_percentage(index, **kwargs)

Calculate the next day's open-to-close return: the adjusted close divided by the adjusted open of the following row of daily prices, minus 1.

PARAMETER	DESCRIPTION
`index`	A multi-level index of datetime and instrument. TYPE: `Index`
`resample`	The resample frequency for the output data. Defaults to None. TYPE: `Optional[str]`
`period`	The number of periods to calculate the percentage change over. Defaults to 1. TYPE: `int`
`trade_at_price`	The price for execution. Defaults to `close`. TYPE: `str`
`**kwargs`	Additional arguments to be passed to the resampler function. DEFAULT: `{}`

RETURNS	DESCRIPTION
	pd.Series: A pd.Series containing the percentage change of stock prices.

Source code in finlab/ml/label.py

def daytrading_percentage(index: pd.Index, **kwargs):
    """Calculate the open-to-close return of the following row of daily prices.

    The adjusted close and adjusted open are each shifted back by one row (so
    every date carries the next row's prices, presumably the next trading
    day), resampled with frequency ``'D'``, and combined as ``close / open - 1``.

    This function has no ``resample``, ``period`` or ``trade_at_price``
    parameters: the resample frequency is fixed to ``'D'`` and any extra keyword
    is forwarded to the resampler.

    Args:
        index (pd.Index): A multi-level index of datetime and instrument.
        **kwargs: Additional arguments to be passed to the resampler function.

    Returns:
        pd.Series: the open-to-close returns aligned to ``index`` by
        ``align_to_feature``.

    """

    market = ml.get_market()
    assert market is not None

    # Fetch and shift both price fields first, then resample them, keeping the
    # close-before-open order.
    next_day = {
        field: market.get_price(field, adj=True).shift(-1)
        for field in ('close', 'open')
    }
    daily = {
        field: resampler(prices, 'D', **kwargs)
        for field, prices in next_day.items()
    }

    intraday_return = (daily['close'] / daily['open']) - 1
    return align_to_feature(index, intraday_return)

excess_over_mean

excess_over_mean(index, resample=None, period=1, trade_at_price='close', **kwargs)

Calculate the excess over mean of market prices over a given period.

PARAMETER	DESCRIPTION
`index`	A multi-level index of datetime and instrument. TYPE: `Index`
`resample`	The resample frequency for the output data. Defaults to None. TYPE: `Optional[str]` DEFAULT: `None`
`period`	The number of periods to calculate the percentage change over. Defaults to 1. TYPE: `int` DEFAULT: `1`
`trade_at_price`	The price for execution. Defaults to `close`. TYPE: `str` DEFAULT: `'close'`
`**kwargs`	Additional arguments to be passed to the resampler function. DEFAULT: `{}`

RETURNS	DESCRIPTION
	pd.Series: A pd.Series containing the percentage change of stock prices.

Source code in finlab/ml/label.py

def excess_over_mean(index: pd.Index, resample=None, period=1, trade_at_price='close', **kwargs):

    """Calculate the excess over mean of market prices over a given period.

    The forward return over ``period`` resampled rows is computed for every
    instrument, and the mean of all instruments on the same date is subtracted
    from each instrument's return.

    Args:
        index (pd.Index): A multi-level index of datetime and instrument.
        resample (Optional[str]): The resample frequency for the output data. Defaults to None.
        period (int): The number of periods to calculate the percentage change over. Defaults to 1.
        trade_at_price (str): The price for execution. Defaults to `close`.
        **kwargs: Additional arguments to be passed to the resampler function.

    Returns:
        pd.Series: the excess returns aligned to ``index`` by ``align_to_feature``.

    """
    market = ml.get_market()
    # Shift back one row so each date carries the following row's price.
    adj = market.get_price(trade_at_price, adj=True).shift(-1)
    uadj = resampler(adj, resample, **kwargs)
    ret = (uadj.shift(-period) / uadj) - 1
    # ret.mean(axis=1) is one value per date. A plain `ret -= series` would match
    # the Series' date labels against the instrument columns; axis=0 subtracts
    # each date's mean from that date's row instead.
    ret = ret.sub(ret.mean(axis=1), axis=0)
    return align_to_feature(index, ret)

excess_over_median

excess_over_median(index, resample=None, period=1, trade_at_price='close', **kwargs)

Calculate the excess over median of market prices over a given period.

PARAMETER	DESCRIPTION
`index`	A multi-level index of datetime and instrument. TYPE: `Index`
`resample`	The resample frequency for the output data. Defaults to None. TYPE: `Optional[str]` DEFAULT: `None`
`period`	The number of periods to calculate the percentage change over. Defaults to 1. TYPE: `int` DEFAULT: `1`
`trade_at_price`	The price for execution. Defaults to `close`. TYPE: `str` DEFAULT: `'close'`
`**kwargs`	Additional arguments to be passed to the resampler function. DEFAULT: `{}`

RETURNS	DESCRIPTION
	pd.Series: A pd.Series containing the percentage change of stock prices.

Source code in finlab/ml/label.py

def excess_over_median(index: pd.Index, resample=None, period=1, trade_at_price='close', **kwargs):

    """Calculate the excess over median of market prices over a given period.

    The forward return over ``period`` resampled rows is computed for every
    instrument, and the median of all instruments on the same date is
    subtracted from each instrument's return.

    Args:
        index (pd.Index): A multi-level index of datetime and instrument.
        resample (Optional[str]): The resample frequency for the output data. Defaults to None.
        period (int): The number of periods to calculate the percentage change over. Defaults to 1.
        trade_at_price (str): The price for execution. Defaults to `close`.
        **kwargs: Additional arguments to be passed to the resampler function.

    Returns:
        pd.Series: the excess returns aligned to ``index`` by ``align_to_feature``.

    """

    market = ml.get_market()
    # Shift back one row so each date carries the following row's price.
    adj = market.get_price(trade_at_price, adj=True).shift(-1)
    uadj = resampler(adj, resample, **kwargs)
    ret = (uadj.shift(-period) / uadj) - 1
    # ret.median(axis=1) is one value per date. A plain `ret -= series` would
    # match the Series' date labels against the instrument columns; axis=0
    # subtracts each date's median from that date's row instead.
    ret = ret.sub(ret.median(axis=1), axis=0)
    return align_to_feature(index, ret)

maximum_adverse_excursion

maximum_adverse_excursion(index, period=1, trade_at_price='close')

Calculate the maximum adverse excursion of market prices over a given period.

PARAMETER	DESCRIPTION
`index`	A multi-level index of datetime and instrument. TYPE: `Index`
`resample`	The resample frequency for the output data. Defaults to None. TYPE: `Optional[str]`
`period`	The number of periods to calculate the percentage change over. Defaults to 1. TYPE: `int` DEFAULT: `1`
`trade_at_price`	The price for execution. Defaults to `close`. TYPE: `str` DEFAULT: `'close'`
`**kwargs`	Additional arguments to be passed to the resampler function.

RETURNS	DESCRIPTION
	pd.Series: A pd.Series containing the percentage change of stock prices.

Source code in finlab/ml/label.py

def maximum_adverse_excursion(index: pd.Index, period=1, trade_at_price='close'):

    """Calculate the maximum adverse excursion of market prices over a given period.

    For every date, the lowest price within the following ``period`` rows is
    compared with the entry price (the price one row ahead), giving
    ``lowest / entry - 1``. The result is reindexed with forward fill to the
    first level of ``index`` before being aligned to it.

    This function takes no ``resample`` argument and no extra keyword arguments.

    Args:
        index (pd.Index): A multi-level index of datetime and instrument; its
            first level supplies the dates the result is reindexed to.
        period (int): The number of rows in the look-ahead window. Defaults to 1.
        trade_at_price (str): The price for execution. Defaults to `close`.

    Returns:
        pd.Series: the excursion values aligned to ``index`` by ``align_to_feature``.

    """

    market = ml.get_market()
    assert market is not None

    entry_price = market.get_price(trade_at_price, adj=True).shift(-1)
    # After shifting by -period, a trailing window of `period` rows ending at a
    # date covers the `period` rows that follow that date.
    lookahead = entry_price.shift(-period)
    lowest = lookahead.rolling(period).min()
    excursion = lowest / entry_price - 1

    feature_dates = index.levels[0]
    excursion = excursion.reindex(feature_dates, method='ffill')
    return align_to_feature(index, excursion)

maximum_favorable_excursion

maximum_favorable_excursion(index, period=1, trade_at_price='close')

Calculate the maximum favorable excursion of market prices over a given period.

PARAMETER	DESCRIPTION
`index`	A multi-level index of datetime and instrument. TYPE: `Index`
`resample`	The resample frequency for the output data. Defaults to None. TYPE: `Optional[str]`
`period`	The number of periods to calculate the percentage change over. Defaults to 1. TYPE: `int` DEFAULT: `1`
`trade_at_price`	The price for execution. Defaults to `close`. TYPE: `str` DEFAULT: `'close'`
`**kwargs`	Additional arguments to be passed to the resampler function.

RETURNS	DESCRIPTION
	pd.Series: A pd.Series containing the percentage change of stock prices.

Source code in finlab/ml/label.py

def maximum_favorable_excursion(index: pd.Index, period=1, trade_at_price='close'):

    """Calculate the maximum favorable excursion of market prices over a given period.

    For every date, the highest price within the following ``period`` rows is
    compared with the entry price (the price one row ahead), giving
    ``highest / entry - 1``. The result is reindexed with forward fill to the
    first level of ``index`` before being aligned to it.

    This function takes no ``resample`` argument and no extra keyword arguments.

    Args:
        index (pd.Index): A multi-level index of datetime and instrument; its
            first level supplies the dates the result is reindexed to.
        period (int): The number of rows in the look-ahead window. Defaults to 1.
        trade_at_price (str): The price for execution. Defaults to `close`.

    Returns:
        pd.Series: the excursion values aligned to ``index`` by ``align_to_feature``.

    """

    market = ml.get_market()
    assert market is not None

    entry_price = market.get_price(trade_at_price, adj=True).shift(-1)
    # After shifting by -period, a trailing window of `period` rows ending at a
    # date covers the `period` rows that follow that date.
    lookahead = entry_price.shift(-period)
    highest = lookahead.rolling(period).max()
    excursion = highest / entry_price - 1

    feature_dates = index.levels[0]
    excursion = excursion.reindex(feature_dates, method='ffill')
    return align_to_feature(index, excursion)

return_percentage

return_percentage(index, resample=None, period=1, trade_at_price='close', **kwargs)

Calculate the percentage change of market prices over a given period.

PARAMETER	DESCRIPTION
`index`	A multi-level index of datetime and instrument. TYPE: `Index`
`resample`	The resample frequency for the output data. Defaults to None. TYPE: `Optional[str]` DEFAULT: `None`
`period`	The number of periods to calculate the percentage change over. Defaults to 1. TYPE: `int` DEFAULT: `1`
`trade_at_price`	The price for execution. Defaults to `close`. TYPE: `str` DEFAULT: `'close'`
`**kwargs`	Additional arguments to be passed to the resampler function. DEFAULT: `{}`

RETURNS	DESCRIPTION
	pd.Series: A pd.Series containing the percentage change of stock prices.

Source code in finlab/ml/label.py

def return_percentage(index: pd.Index, resample=None, period=1, trade_at_price='close', **kwargs):

    """Calculate the percentage change of market prices over a given period.

    The adjusted price is shifted back one row, resampled, and the forward
    return ``price[t + period] / price[t] - 1`` is computed on the resampled rows.

    Args:
        index (pd.Index): A multi-level index of datetime and instrument.
        resample (Optional[str]): The resample frequency for the output data. Defaults to None.
        period (int): The number of periods to calculate the percentage change over. Defaults to 1.
        trade_at_price (str): The price for execution. Defaults to `close`.
        **kwargs: Additional arguments to be passed to the resampler function.

    Returns:
        pd.Series: the percentage changes aligned to ``index`` by ``align_to_feature``.

    """

    market = ml.get_market()
    assert market is not None

    entry_price = market.get_price(trade_at_price, adj=True).shift(-1)
    sampled = resampler(entry_price, resample, **kwargs)
    future = sampled.shift(-period)
    pct_change = (future / sampled) - 1
    return align_to_feature(index, pct_change)

finlab.ml.qlib

DumpDataBase

DumpDataBase(csv_path, qlib_dir, backup_dir=None, freq='day', max_workers=16, date_field_name='date', file_suffix='.csv', symbol_field_name='symbol', exclude_fields='', include_fields='', limit_nums=None)

Base class for dumping data to Qlib format.

PARAMETER	DESCRIPTION
`csv_path`	The path to the CSV file or directory containing the CSV files. TYPE: `str`
`qlib_dir`	The directory where the Qlib data will be saved. TYPE: `str`
`backup_dir`	The directory where the backup of the Qlib data will be saved. Defaults to None. TYPE: `str` DEFAULT: `None`
`freq`	The frequency of the data. Defaults to "day". TYPE: `str` DEFAULT: `'day'`
`max_workers`	The maximum number of workers for parallel processing. Defaults to 16. TYPE: `int` DEFAULT: `16`
`date_field_name`	The name of the date field in the CSV file. Defaults to "date". TYPE: `str` DEFAULT: `'date'`
`file_suffix`	The suffix of the CSV file. Defaults to ".csv". TYPE: `str` DEFAULT: `'.csv'`
`symbol_field_name`	The name of the symbol field in the CSV file. Defaults to "symbol". TYPE: `str` DEFAULT: `'symbol'`
`exclude_fields`	The fields to exclude from the dumped data. Defaults to "". TYPE: `str` DEFAULT: `''`
`include_fields`	The fields to include in the dumped data. Defaults to "". TYPE: `str` DEFAULT: `''`
`limit_nums`	The maximum number of CSV files to process. Defaults to None. TYPE: `int` DEFAULT: `None`

Source code in finlab/ml/qlib.py

def __init__(
    self,
    csv_path: str,
    qlib_dir: str,
    backup_dir: str = None,
    freq: str = "day",
    max_workers: int = 16,
    date_field_name: str = "date",
    file_suffix: str = ".csv",
    symbol_field_name: str = "symbol",
    exclude_fields: str = "",
    include_fields: str = "",
    limit_nums: int = None,
):
    """Set up paths, field selection and bookkeeping for a dump to Qlib format.

    Args:
        csv_path: A CSV file, or a directory whose files ending in
            ``file_suffix`` are collected (sorted).
        qlib_dir: Directory the Qlib data is written to.
        backup_dir: Optional backup directory; kept as None when not given.
        freq: Data frequency; ``"day"`` selects the daily calendar format, any
            other value the high-frequency format.
        max_workers: Stored as ``self.works``.
        date_field_name: Name of the date field in the CSV files.
        file_suffix: Suffix used to collect files from a directory.
        symbol_field_name: Name of the symbol field in the CSV files.
        exclude_fields: Accepted but not read by this constructor.
        include_fields: Comma-separated string (or iterable of strings) of
            fields; entries are stripped and empty ones dropped.
        limit_nums: If not None, only the first ``int(limit_nums)`` files are kept.
    """
    # NOTE(review): exclude_fields is never referenced below — confirm whether it
    # should be parsed and stored the same way as include_fields.
    source = Path(csv_path).expanduser()

    requested = include_fields.split(",") if isinstance(include_fields, str) else include_fields
    self._include_fields = tuple(name for name in map(str.strip, requested) if len(name) > 0)

    self.file_suffix = file_suffix
    self.symbol_field_name = symbol_field_name

    if source.is_dir():
        candidates = source.glob(f"*{self.file_suffix}")
    else:
        candidates = [source]
    self.csv_files = sorted(candidates)
    if limit_nums is not None:
        self.csv_files = self.csv_files[: int(limit_nums)]

    self.qlib_dir = Path(qlib_dir).expanduser()
    self.backup_dir = None if backup_dir is None else Path(backup_dir).expanduser()

    self.freq = freq
    if self.freq == "day":
        self.calendar_format = self.DAILY_FORMAT
    else:
        self.calendar_format = self.HIGH_FREQ_FORMAT

    self.works = max_workers
    self.date_field_name = date_field_name

    # Output sub-directories under the Qlib directory.
    self._calendars_dir = self.qlib_dir.joinpath(self.CALENDARS_DIR_NAME)
    self._features_dir = self.qlib_dir.joinpath(self.FEATURES_DIR_NAME)
    self._instruments_dir = self.qlib_dir.joinpath(self.INSTRUMENTS_DIR_NAME)

    self._calendars_list = []

    self._mode = self.ALL_MODE
    self._kwargs = {}

CatBoostModel

CatBoostModel()

CatBoostModel is a wrapper model for CatBoost model.

import finlab.ml.qlib as q

# build X_train, y_train, X_test

model = q.CatBoostModel()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Source code in finlab/ml/qlib.py

def CatBoostModel():
    """
    Build a ``WrapperModel`` configured for the CatBoost model.

    The configuration (class ``CatBoostModel`` in
    ``qlib.contrib.model.catboost_model`` plus its keyword arguments) is parsed
    from the inline YAML below.

    ```py
    import finlab.ml.qlib as q

    # build X_train, y_train, X_test

    model = q.CatBoostModel()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ```
    """
    config_yaml = """
class: CatBoostModel
module_path: qlib.contrib.model.catboost_model
kwargs:
    loss: RMSE
    learning_rate: 0.0421
    subsample: 0.8789
    max_depth: 6
    num_leaves: 100
    thread_count: 20
    grow_policy: Lossguide
"""
    config = yaml.safe_load(config_yaml)
    return WrapperModel(config)

DEnsmbleModel

DEnsmbleModel()

DEnsmbleModel is a wrapper model for Double Ensemble model.

import finlab.ml.qlib as q

# build X_train, y_train, X_test

model = q.DEnsmbleModel()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Source code in finlab/ml/qlib.py

def DEnsmbleModel():
    """
    Build a ``WrapperModel`` configured for the Double Ensemble model.

    The configuration (class ``DEnsembleModel`` in
    ``qlib.contrib.model.double_ensemble`` plus its keyword arguments) is parsed
    from the inline YAML below.

    ```py
    import finlab.ml.qlib as q

    # build X_train, y_train, X_test

    model = q.DEnsmbleModel()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ```
    """
    config_yaml = """
class: DEnsembleModel
module_path: qlib.contrib.model.double_ensemble
kwargs:
    base_model: "gbm"
    loss: mse
    num_models: 3
    enable_sr: True
    enable_fs: True
    alpha1: 1
    alpha2: 1
    bins_sr: 10
    bins_fs: 5
    decay: 0.5
    sample_ratios:
        - 0.8
        - 0.7
        - 0.6
        - 0.5
        - 0.4
    sub_weights:
        - 1
        - 1
        - 1
    epochs: 28
    colsample_bytree: 0.8879
    learning_rate: 0.0421
    subsample: 0.8789
    lambda_l1: 205.6999
    lambda_l2: 580.9768
    max_depth: 8
    num_leaves: 210
    num_threads: 20
    verbosity: -1
"""
    config = yaml.safe_load(config_yaml)
    return WrapperModel(config)

DNNModel

DNNModel()

DNNModel is a wrapper model for Deep Neural Network model.

import finlab.ml.qlib as q

# build X_train, y_train, X_test

model = q.DNNModel()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Source code in finlab/ml/qlib.py

def DNNModel():
    """
    Build a ``WrapperModel`` configured for the Deep Neural Network model.

    The configuration (class ``DNNModelPytorch`` in
    ``qlib.contrib.model.pytorch_nn`` plus its keyword arguments) is parsed from
    the inline YAML below.

    ```py
    import finlab.ml.qlib as q

    # build X_train, y_train, X_test

    model = q.DNNModel()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ```
    """
    config_yaml = """
class: DNNModelPytorch
module_path: qlib.contrib.model.pytorch_nn
kwargs:
    loss: mse
    lr: 0.002
    optimizer: adam
    max_steps: 8000
    batch_size: 8192
    GPU: 0
    weight_decay: 0.0002
    pt_model_kwargs:
      input_dim: 8
"""
    config = yaml.safe_load(config_yaml)
    return WrapperModel(config)

LGBModel

LGBModel()

LGBModel is a wrapper model for LightGBM model.

import finlab.ml.qlib as q

# build X_train, y_train, X_test

model = q.LGBModel()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Source code in finlab/ml/qlib.py

def LGBModel():
    """Build a ``WrapperModel`` configured for the LightGBM model.

    The configuration (class ``LGBModel`` in ``qlib.contrib.model.gbdt`` plus
    its keyword arguments) is parsed from the inline YAML below.

    ```py
    import finlab.ml.qlib as q

    # build X_train, y_train, X_test

    model = q.LGBModel()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ```
    """
    config_yaml = """
    class: LGBModel
    module_path: qlib.contrib.model.gbdt
    kwargs:
        loss: mse
        colsample_bytree: 0.8879
        learning_rate: 0.2
        subsample: 0.8789
        lambda_l1: 205.6999
        lambda_l2: 580.9768
        max_depth: 8
        num_leaves: 210
        num_threads: 20
    """
    config = yaml.safe_load(config_yaml)
    return WrapperModel(config)

LinearModel

LinearModel()

LinearModel is a wrapper model for Linear model.

import finlab.ml.qlib as q

# build X_train, y_train, X_test

model = q.LinearModel()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Source code in finlab/ml/qlib.py

def LinearModel():
    """
    Build a ``WrapperModel`` configured for the Linear model.

    The configuration (class ``LinearModel`` in ``qlib.contrib.model.linear``
    with the ``ols`` estimator) is parsed from the inline YAML below.

    ```py
    import finlab.ml.qlib as q

    # build X_train, y_train, X_test

    model = q.LinearModel()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ```
    """
    config_yaml = """
class: LinearModel
module_path: qlib.contrib.model.linear
kwargs:
    estimator: ols
"""
    config = yaml.safe_load(config_yaml)
    return WrapperModel(config)

SFMModel

SFMModel()

SFMModel is a wrapper model for SFM.

import finlab.ml.qlib as q

# build X_train, y_train, X_test

model = q.SFMModel()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Source code in finlab/ml/qlib.py

def SFMModel():
    """
    Build a ``WrapperModel`` configured for the SFM model.

    The configuration (class ``SFM`` in ``qlib.contrib.model.pytorch_sfm`` plus
    its keyword arguments) is parsed from the inline YAML below.

    ```py
    import finlab.ml.qlib as q

    # build X_train, y_train, X_test

    model = q.SFMModel()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ```
    """
    config_yaml = """
class: SFM
module_path: qlib.contrib.model.pytorch_sfm
kwargs:
    d_feat: 6
    hidden_size: 64
    output_dim: 32
    freq_dim: 25
    dropout_W: 0.5
    dropout_U: 0.5
    n_epochs: 20
    lr: 0.001
    batch_size: 1600
    early_stop: 20
    eval_steps: 5
    loss: mse
    optimizer: adam
    GPU: 0
"""
    config = yaml.safe_load(config_yaml)
    return WrapperModel(config)

TabnetModel

TabnetModel()

TabnetModel is a wrapper model for Tabnet model.

import finlab.ml.qlib as q

# build X_train, y_train, X_test

model = q.TabnetModel()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Source code in finlab/ml/qlib.py

def TabnetModel():
    """
    Build a WrapperModel configured for qlib's Tabnet model.

    The configuration selects class ``TabnetModel`` from
    ``qlib.contrib.model.pytorch_tabnet`` with ``d_feat: 8``,
    ``pretrain: False`` and ``seed: 993``.

    Returns:
        The WrapperModel instance created from that configuration.

    Examples:
        ```py
        import finlab.ml.qlib as q

        # build X_train, y_train, X_test

        model = q.TabnetModel()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        ```
    """
    # The YAML text is the single source of truth for the model settings.
    model_config = yaml.safe_load("""
class: TabnetModel
module_path: qlib.contrib.model.pytorch_tabnet
kwargs:
    d_feat: 8
    pretrain: False
    seed: 993
""")
    return WrapperModel(model_config)

XGBModel

XGBModel()

XGBModel is a wrapper model for XGBoost model.

import finlab.ml.qlib as q

# build X_train, y_train, X_test

model = q.XGBModel()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Source code in finlab/ml/qlib.py

def XGBModel():
    """
    Build a WrapperModel configured for qlib's XGBoost model.

    The configuration selects class ``XGBModel`` from
    ``qlib.contrib.model.xgboost`` and fixes its hyper-parameters
    (evaluation metric, tree depth, number of estimators, sampling ratios,
    thread count) to the values listed in the YAML text below.

    Returns:
        The WrapperModel instance created from that configuration.

    Examples:
        ```py
        import finlab.ml.qlib as q

        # build X_train, y_train, X_test

        model = q.XGBModel()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        ```
    """
    # The YAML text is the single source of truth for the model settings.
    model_config = yaml.safe_load("""
class: XGBModel
module_path: qlib.contrib.model.xgboost
kwargs:
    eval_metric: rmse
    colsample_bytree: 0.8879
    eta: 0.0421
    max_depth: 8
    n_estimators: 647
    subsample: 0.8789
    nthread: 20
""")
    return WrapperModel(model_config)

alpha

alpha(handler='Alpha158', **kwargs)

產生 Qlib 的特徵 Args: handler (str): 預設為 'Alpha158'，也可以設定成 'Alpha360'（名稱區分大小寫） Examples:

import finlab.ml.qlib as q
features = q.alpha('Alpha158')

Source code in finlab/ml/qlib.py

def alpha(handler='Alpha158', **kwargs):

    """Generate Qlib features for every instrument in the 'all' market.

    ``init()`` is called first, so the Qlib environment is set up before the
    handler is built.

    Args:
        handler (str): Name of the Qlib data handler. Defaults to
            'Alpha158'; 'Alpha360' is also accepted. The comparison is
            case-sensitive.
        **kwargs: Extra keyword arguments forwarded to the handler
            constructor.

    Returns:
        The result of the handler's ``fetch(col_set="feature")`` call.

    Raises:
        Exception: If ``handler`` is neither 'Alpha158' nor 'Alpha360'.

    Examples:
        ```py
        import finlab.ml.qlib as q
        features = q.alpha('Alpha158')
        ```
    """
    init()

    # Reject unknown names up front; only two handlers are supported.
    if handler not in ('Alpha158', 'Alpha360'):
        raise Exception(f"Handler {handler} not supported.")

    handler_cls = Alpha158 if handler == 'Alpha158' else Alpha360
    data_handler = handler_cls(instruments=D.instruments(market='all'), **kwargs)

    return data_handler.fetch(col_set="feature")

dump

dump(freq='day')

產生Qlib 於台股的資料庫 Examples:

import qlib
import finlab.ml.qlib as q

q.dump() # generate tw stock database
q.init() # initiate tw stock to perform machine learning tasks (similar to qlib.init)

import qlib
# qlib functions and operations

Source code in finlab/ml/qlib.py

def dump(freq='day'):
    """Export price data to per-symbol CSV files and build a Qlib database.

    The market object comes from ``ml.get_market()`` and the region name from
    ``get_region(market)``. One CSV per symbol is written under
    ``~/.qlib/csv_data/{region}_data`` and then converted by ``DumpDataAll``
    into ``~/.qlib/qlib_data/{region}_data``. (The original documentation
    describes this as generating the Qlib database for Taiwan stocks.)

    Each CSV contains the columns ``date``, ``volume``, ``high``, ``low``,
    ``close``, ``open``, ``factor`` and ``symbol``, where ``factor`` is the
    adjusted close divided by the unadjusted close.

    Args:
        freq (str): Frequency label forwarded to ``DumpDataAll``.
            Defaults to 'day'.

    Examples:
        ```py
        import qlib
        import finlab.ml.qlib as q

        q.dump() # generate tw stock database
        q.init() # initiate tw stock to perform machine learning tasks (similar to qlib.init)

        import qlib
        # qlib functions and operations
        ```
    """

    market = ml.get_market()
    region = get_region(market)

    csv_path = f'~/.qlib/csv_data/{region}_data'
    qlib_dir = f'~/.qlib/qlib_data/{region}_data'
    include_fields = "open,close,high,low,volume,factor"

    # exist_ok=True replaces the exists()-then-mkdir() pair, which could fail
    # if another process created the directory between the two calls.
    csv_dir = Path(csv_path).expanduser()
    csv_dir.mkdir(parents=True, exist_ok=True)
    Path(qlib_dir).expanduser().mkdir(parents=True, exist_ok=True)

    c = market.get_price('close', adj=False)
    ac = market.get_price('close', adj=True)
    o = market.get_price('open', adj=False)
    h = market.get_price('high', adj=False)
    l = market.get_price('low', adj=False)
    v = market.get_price('volume', adj=False)

    assert c is not None
    assert ac is not None
    assert o is not None
    assert h is not None
    assert l is not None
    assert v is not None

    # The date axis is shared by every symbol, so extract it once.
    dates = c.index.values

    for s in c.columns:
        close_values = c[s].values
        pd.DataFrame({
            'date': dates,
            'volume': v[s].values,
            'high': h[s].values,
            'low': l[s].values,
            'close': close_values,
            'open': o[s].values,
            # price adjustment factor: adjusted close / raw close
            'factor': ac[s].values / close_values,
            'symbol': s
            }).to_csv(csv_dir / f"{s}.csv")

    # DumpDataAll receives the same (unexpanded) path strings as before.
    dumper = DumpDataAll(csv_path, qlib_dir, include_fields=include_fields, freq=freq)
    dumper()

get_models

get_models()

Return a dictionary that maps each available model name to its factory function. Examples:

import finlab.ml.qlib as q

models = q.get_models()
print(models)

output:

{ 'LGBModel': LGBModel, 'XGBModel': XGBModel, 'DEnsmbleModel': DEnsmbleModel, 'CatBoostModel': CatBoostModel, 'LinearModel': LinearModel, 'TabnetModel': TabnetModel, 'DNNModel': DNNModel}

Source code in finlab/ml/qlib.py

def get_models():
    """Return a dictionary mapping model names to their factory functions.

    The keys are 'LGBModel', 'XGBModel', 'DEnsmbleModel', 'CatBoostModel',
    'LinearModel', 'TabnetModel' and 'DNNModel'; each value is the
    module-level callable of the same name.

    Returns:
        dict: ``{name: factory}`` for the models listed above.

    Examples:
        ```py
        import finlab.ml.qlib as q

        models = q.get_models()
        print(models)
        ```
        output:

        { 'LGBModel': LGBModel, 'XGBModel': XGBModel, 'DEnsmbleModel': DEnsmbleModel, 'CatBoostModel': CatBoostModel, 'LinearModel': LinearModel, 'TabnetModel': TabnetModel, 'DNNModel': DNNModel}

    """
    # Listed as (name, factory) pairs; dict() keeps this insertion order.
    factories = (
        ('LGBModel', LGBModel),
        ('XGBModel', XGBModel),
        ('DEnsmbleModel', DEnsmbleModel),
        ('CatBoostModel', CatBoostModel),
        ('LinearModel', LinearModel),
        ('TabnetModel', TabnetModel),
        ('DNNModel', DNNModel),
    )
    return dict(factories)

init

init()

Qlib 初始化 (類似於台股版 qlib.init() 但更簡單易用) Examples:

import qlib
import finlab.ml.qlib as q

q.dump() # generate tw stock database
q.init() # initiate tw stock to perform machine learning tasks (similar to qlib.init)

import qlib
# qlib functions and operations

Source code in finlab/ml/qlib.py

def init():
    """Initialise Qlib for the current market (a simpler stand-in for qlib.init()).

    The region name is obtained from ``get_region(ml.get_market())``. The
    function first tries to register a default region configuration
    (``trade_unit=1000``, ``limit_threshold=0.1``, ``deal_price='close'``)
    with Qlib, then calls ``qlib.init`` with the data directory
    ``~/.qlib/qlib_data/{region}_data``. ``qlib.init`` runs at most once:
    later calls are skipped via the module-level ``qlib_initialized`` flag.

    Examples:
        ```py
        import qlib
        import finlab.ml.qlib as q

        q.dump() # generate tw stock database
        q.init() # initiate tw stock to perform machine learning tasks (similar to qlib.init)

        import qlib
        # qlib functions and operations
        ```
    """
    global qlib_initialized

    region = get_region(ml.get_market())
    try:
        from qlib import config
        config._default_region_config[region] = \
                dict(trade_unit=1000, limit_threshold=0.1, deal_price='close')
    except Exception:
        # Best effort: this touches a private Qlib attribute that may not
        # exist in every version. Catching Exception (not a bare except)
        # keeps KeyboardInterrupt/SystemExit from being swallowed.
        pass

    if not qlib_initialized:
        qlib.init(provider_uri=f'~/.qlib/qlib_data/{region}_data',
                  region=region)
        qlib_initialized = True

finlab.ml.alphalens

create_factor_data

create_factor_data(factor, adj_close, days=[5, 10, 20, 60])

create factor data, which contains future return

PARAMETER	DESCRIPTION
`factor`	factor data where index is datetime and columns is asset id TYPE: `DataFrame`
`adj_close`	adj close where index is datetime and columns is asset id TYPE: `DataFrame`
`days`	future return considered TYPE: `List[int]` DEFAULT: `[5, 10, 20, 60]`

Return

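A DataFrame indexed by (date, asset) that contains the factor value, its quantile bucket, and one forward-return column per entry in `days` (e.g. `5D`, `10D`). It is the input for alphalens tear sheets, not the plots themselves.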

Examples:

股價淨值比分析

import alphalens
from finlab import data
from finlab.ml.alphalens import create_factor_data

factor = data.get('price_earning_ratio:股價淨值比')
adj_close = data.get('etl:adj_close')

factor_data = create_factor_data(factor, adj_close)

alphalens.tears.create_full_tear_sheet(factor_data.dropna(), long_short=False,
                                       group_neutral=False, by_group=False)

Source code in finlab/ml/alphalens.py

def create_factor_data(factor:pd.DataFrame, adj_close:pd.DataFrame, 
                       days:List[int]=[5,10,20, 60]):

    '''Build a long-format factor table that also contains future returns.

    Prices are restricted to the factor's date range, the factor is
    forward-filled onto the price dates, and both frames are reduced to the
    assets they have in common. Rows containing NaN or infinite values are
    dropped from the result.

    Args:
        factor (pd.DataFrame): factor data where index is datetime and columns is asset id
        adj_close (pd.DataFrame): adj close where index is datetime and columns is asset id
        days (List[int]): future return horizons, in rows of `adj_close`.
            The default list is never mutated by this function. An empty
            list yields a table without return columns.

    Returns:
        pd.DataFrame: indexed by (`date`, `asset`) with the columns

        - `factor`: the factor value,
        - `factor_quantile`: the cross-sectional percentile rank
          floor-divided by 0.2,
        - `"{d}D"` for each `d` in `days`: the return from the next row's
          price to the price `d` rows after that.

    Examples:
        ``` py title="Price-to-book ratio analysis"
        import alphalens
        from finlab import data
        from finlab.ml.alphalens import create_factor_data

        factor = data.get('price_earning_ratio:股價淨值比')
        adj_close = data.get('etl:adj_close')

        factor_data = create_factor_data(factor, adj_close)

        alphalens.tears.create_full_tear_sheet(factor_data.dropna(), long_short=False,
                                               group_neutral=False, by_group=False)

        ```

    '''

    first_date, last_date = factor.index[0], factor.index[-1]
    adj_close = adj_close.loc[first_date:last_date]
    factor = factor.reindex(adj_close.index, method='ffill').loc[first_date:last_date]

    sids = adj_close.columns.intersection(factor.columns)
    adj_close = adj_close[sids]
    factor = factor[sids]

    # unstack() flattens to an (asset, date) MultiIndex. factor and adj_close
    # now share rows and columns, so this index is valid for every column
    # below, including when `days` is empty.
    flat_factor = factor.unstack()

    ret = {}
    ret['factor'] = flat_factor.values
    ret['factor_quantile'] = (factor.rank(axis=1, pct=True) // 0.2).unstack().values

    # Returns are measured from the next row's price, not the current one.
    entry_price = adj_close.shift(-1)
    for d in days:
        ret[f"{d}D"] = (adj_close.shift(-d-1) / entry_price - 1).unstack().values

    ret = pd.DataFrame(ret, index=flat_factor.index.swaplevel(0, 1))\
        .replace([-np.inf, np.inf], np.nan)\
        .dropna()
    ret.index.names = ['date', 'asset']
    return ret

factor_weights

factor_weights(factor_data, demeaned=True, group_adjust=False, equal_weight=False)

Computes asset weights by factor values and dividing by the sum of their absolute value (achieving gross leverage of 1). Positive factor values will result in positive weights and negative values in negative weights.

PARAMETER	DESCRIPTION
`factor_data`	A MultiIndex DataFrame indexed by date (level 0) and asset (level 1), containing the values for a single alpha factor, forward returns for each period, the factor quantile/bin that factor value belongs to, and (optionally) the group the asset belongs to. - See full explanation in utils.get_clean_factor_and_forward_returns TYPE: `DataFrame - MultiIndex`
`demeaned`	Should this computation happen on a long short portfolio? if True, weights are computed by demeaning factor values and dividing by the sum of their absolute value (achieving gross leverage of 1). The sum of positive weights will be the same as the negative weights (absolute value), suitable for a dollar neutral long-short portfolio TYPE: `bool` DEFAULT: `True`
`group_adjust`	Should this computation happen on a group neutral portfolio? If True, compute group neutral weights: each group will weight the same and if 'demeaned' is enabled the factor values demeaning will occur on the group level. TYPE: `bool` DEFAULT: `False`
`equal_weight`	if True the assets will be equal-weighted instead of factor-weighted If demeaned is True then the factor universe will be split in two equal sized groups, top assets with positive weights and bottom assets with negative weights TYPE: `bool` DEFAULT: `False`

RETURNS	DESCRIPTION
`returns`	pd.Series Assets weighted by factor value.

Source code in finlab/ml/alphalens.py

def factor_weights(factor_data,
                   demeaned=True,
                   group_adjust=False,
                   equal_weight=False):
    """
    Turn factor values into portfolio weights with gross leverage of 1.

    Within each date (and each group, when `group_adjust` is set) the factor
    values are optionally centred, optionally replaced by equal-sized long
    and short legs, and finally divided by the sum of their absolute values.
    Positive values end up with positive weights, negative values with
    negative weights.

    Args:
        factor_data (pd.DataFrame - MultiIndex):
            A MultiIndex DataFrame indexed by date (level 0) and asset
            (level 1). The `factor` column is read here; a `group` column is
            also required when `group_adjust` is True.
        demeaned (bool):
            If True, centre the values before normalising: the mean is
            subtracted for factor weighting, the median for equal weighting.
            This makes long and short legs offset each other, as needed for
            a dollar-neutral long-short portfolio.
        group_adjust (bool):
            If True, weights are first computed per (date, group) and then
            normalised once more per date, without centring, so that each
            date again sums to gross leverage 1.
        equal_weight (bool, optional):
            If True, every asset on the same side gets the same weight
            instead of a factor-proportional one. Combined with `demeaned`,
            the universe is split around the median into a long and a short
            leg.

    Returns:
        pd.Series: asset weights, indexed like `factor_data`.
    """

    def _normalise(values, centre, equalise):
        # Factor-weighted path: optional mean-centring, then scale.
        if not equalise:
            if centre:
                values = values - values.mean()
            return values / values.abs().sum()

        # Equal-weighted path.
        values = values.copy()
        if centre:
            # Splitting around the median puts the top assets on the long
            # side and the bottom assets on the short side.
            values = values - values.median()

        is_short = values < 0
        values[is_short] = -1.0
        is_long = values > 0
        values[is_long] = 1.0

        if centre:
            # Give each leg the same total weight regardless of its size.
            if is_short.any():
                values[is_short] /= is_short.sum()
            if is_long.any():
                values[is_long] /= is_long.sum()

        return values / values.abs().sum()

    keys = ['date', 'group'] if group_adjust else ['date']

    per_bucket = factor_data.groupby(keys, group_keys=False)['factor']
    weights = per_bucket.apply(_normalise, demeaned, equal_weight)

    if group_adjust:
        # Second pass: rescale the per-group weights within each date.
        per_date = weights.groupby(level='date', group_keys=False)
        weights = per_date.apply(_normalise, False, False)

    return weights