diff --git a/LEGAL.md b/LEGAL.md new file mode 100644 index 0000000..f968920 --- /dev/null +++ b/LEGAL.md @@ -0,0 +1,7 @@ +Legal Disclaimer + +Within this source code, the comments in Chinese shall be the original, governing version. Any comment in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail. + +法律免责声明 + +关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。 \ No newline at end of file diff --git a/README.md b/README.md index 60dc8f8..31ddd22 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,120 @@ -# Time-LLM -Official implementation of "Time-LLM: Time Series Forecasting by Reprogramming Large Language Models" +
+ + + +
+ +--- +> +> 🙋 Please let us know if you find out a mistake or have any suggestions! +> +> 🌟 If you find this resource helpful, please consider to star this repository and cite our research: + +``` +@inproceedings{jin2023time, + title={Time-llm: Time series forecasting by reprogramming large language models}, + author={Jin, Ming and Wang, Shiyu and Ma, Lintao and Chu, Zhixuan and Zhang, James Y and Shi, Xiaoming and Chen, Pin-Yu and Liang, Yuxuan and Li, Yuan-Fang and Pan, Shirui and others}, + booktitle={International Conference on Learning Representations}, + year={2024} +} +``` + +## Introdcution +Time-LLM is a reprogramming framework to repurpose LLMs for general time series forecasting with the backbone language models kept intact. +Notably, we show that time series analysis (e.g., forecasting) can be cast as yet another "language task" that can be effectively tackled by an off-the-shelf LLM. + ++ +
+ +- Time-LLM comprises two key components: (1) reprogramming the input time series into text prototype representations that are more natural for the LLM, and (2) augmenting the input context with declarative prompts (e.g., domain expert knowledge and task instructions) to guide LLM reasoning. + ++ +
+ +## Requirements +- accelerate==0.20.3 +- einops==0.7.0 +- matplotlib==3.7.0 +- numpy==1.23.5 +- pandas==1.5.3 +- scikit_learn==1.2.2 +- scipy==1.5.4 +- torch==2.0.1 +- tqdm==4.65.0 +- peft==0.4.0 +- transformers==4.31.0 +- deepspeed==0.13.0 + +To install all dependencies: +``` +pip install -r requirements.txt +``` + +## Datasets +You can access the well pre-processed datasets from [[Google Drive]](https://drive.google.com/file/d/1NF7VEefXCmXuWNbnNe858WvQAkJ_7wuP/view?usp=sharing), then place the downloaded contents under `./dataset` + +## Quick Demos +1. Download datasets and place them under `./dataset` +2. Tune the model. We provide five experiment scripts for demonstration purpose under the folder `./scripts`. For example, you can evaluate on ETT datasets by: + +```bash +bash ./scripts/TimeLLM_ETTh1.sh +``` +```bash +bash ./scripts/TimeLLM_ETTh2.sh +``` +```bash +bash ./scripts/TimeLLM_ETTm1.sh +``` +```bash +bash ./scripts/TimeLLM_ETTm2.sh +``` + +## Detailed usage + +Please refer to ```run_main.py``` and ```run_m4.py``` for the detailed description of each hyperparameter. + + +## Further Reading +[**Large Models for Time Series and Spatio-Temporal Data: A Survey and Outlook**](https://arxiv.org/abs/2310.10196) + +**Authors**: Ming Jin, Qingsong Wen*, Yuxuan Liang, Chaoli Zhang, Siqiao Xue, Xue Wang, James Zhang, Yi Wang, Haifeng Chen, Xiaoli Li (IEEE Fellow), Shirui Pan*, Vincent S. Tseng (IEEE Fellow), Yu Zheng (IEEE Fellow), Lei Chen (IEEE Fellow), Hui Xiong (IEEE Fellow) + +🌟 If you find this resource helpful, please consider to cite it in your research: + +``` +@article{jin2023lm4ts, + title={Large Models for Time Series and Spatio-Temporal Data: A Survey and Outlook}, + author={Ming Jin and Qingsong Wen and Yuxuan Liang and Chaoli Zhang and Siqiao Xue and Xue Wang and James Zhang and Yi Wang and Haifeng Chen and Xiaoli Li and Shirui Pan and Vincent S. Tseng and Yu Zheng and Lei Chen and Hui Xiong}, + journal={arXiv preprint arXiv:2310.10196}, + year={2023} +} +``` + +## Acknowledgement +Our implementation adapts [Time-Series-Library](https://github.com/thuml/Time-Series-Library) and [GPT4TS](https://github.com/DAMO-DI-ML/NeurIPS2023-One-Fits-All) as the code base and have extensively modified it to our purposes. We thank the authors for sharing their implementations and related resources. \ No newline at end of file diff --git a/data_provider/__init__.py b/data_provider/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/data_provider/__init__.py @@ -0,0 +1 @@ + diff --git a/data_provider/data_factory.py b/data_provider/data_factory.py new file mode 100644 index 0000000..1e51851 --- /dev/null +++ b/data_provider/data_factory.py @@ -0,0 +1,62 @@ +from data_provider.data_loader import Dataset_ETT_hour, Dataset_ETT_minute, Dataset_Custom, Dataset_M4 +from torch.utils.data import DataLoader + +data_dict = { + 'ETTh1': Dataset_ETT_hour, + 'ETTh2': Dataset_ETT_hour, + 'ETTm1': Dataset_ETT_minute, + 'ETTm2': Dataset_ETT_minute, + 'custom': Dataset_Custom, + 'm4': Dataset_M4, +} + + +def data_provider(args, flag): + Data = data_dict[args.data] + timeenc = 0 if args.embed != 'timeF' else 1 + percent = args.percent + + if flag == 'test': + shuffle_flag = False + drop_last = True + batch_size = args.batch_size + freq = args.freq + else: + shuffle_flag = True + drop_last = True + batch_size = args.batch_size + freq = args.freq + + if args.data == 'm4': + drop_last = False + data_set = Data( + root_path=args.root_path, + data_path=args.data_path, + flag=flag, + size=[args.seq_len, args.label_len, args.pred_len], + features=args.features, + target=args.target, + timeenc=timeenc, + freq=freq, + seasonal_patterns=args.seasonal_patterns + ) + else: + data_set = Data( + root_path=args.root_path, + data_path=args.data_path, + flag=flag, + size=[args.seq_len, args.label_len, args.pred_len], + features=args.features, + target=args.target, + timeenc=timeenc, + freq=freq, + percent=percent, + seasonal_patterns=args.seasonal_patterns + ) + data_loader = DataLoader( + data_set, + batch_size=batch_size, + shuffle=shuffle_flag, + num_workers=args.num_workers, + drop_last=drop_last) + return data_set, data_loader diff --git a/data_provider/data_loader.py b/data_provider/data_loader.py new file mode 100644 index 0000000..a9a1569 --- /dev/null +++ b/data_provider/data_loader.py @@ -0,0 +1,389 @@ +import os +import numpy as np +import pandas as pd +from torch.utils.data import Dataset +from sklearn.preprocessing import StandardScaler +from utils.timefeatures import time_features +from data_provider.m4 import M4Dataset, M4Meta +import warnings + +warnings.filterwarnings('ignore') + + +class Dataset_ETT_hour(Dataset): + def __init__(self, root_path, flag='train', size=None, + features='S', data_path='ETTh1.csv', + target='OT', scale=True, timeenc=0, freq='h', percent=100, + seasonal_patterns=None): + if size == None: + self.seq_len = 24 * 4 * 4 + self.label_len = 24 * 4 + self.pred_len = 24 * 4 + else: + self.seq_len = size[0] + self.label_len = size[1] + self.pred_len = size[2] + # init + assert flag in ['train', 'test', 'val'] + type_map = {'train': 0, 'val': 1, 'test': 2} + self.set_type = type_map[flag] + + self.percent = percent + self.features = features + self.target = target + self.scale = scale + self.timeenc = timeenc + self.freq = freq + + # self.percent = percent + self.root_path = root_path + self.data_path = data_path + self.__read_data__() + + self.enc_in = self.data_x.shape[-1] + self.tot_len = len(self.data_x) - self.seq_len - self.pred_len + 1 + + def __read_data__(self): + self.scaler = StandardScaler() + df_raw = pd.read_csv(os.path.join(self.root_path, + self.data_path)) + + border1s = [0, 12 * 30 * 24 - self.seq_len, 12 * 30 * 24 + 4 * 30 * 24 - self.seq_len] + border2s = [12 * 30 * 24, 12 * 30 * 24 + 4 * 30 * 24, 12 * 30 * 24 + 8 * 30 * 24] + + border1 = border1s[self.set_type] + border2 = border2s[self.set_type] + + if self.set_type == 0: + border2 = (border2 - self.seq_len) * self.percent // 100 + self.seq_len + + if self.features == 'M' or self.features == 'MS': + cols_data = df_raw.columns[1:] + df_data = df_raw[cols_data] + elif self.features == 'S': + df_data = df_raw[[self.target]] + + if self.scale: + train_data = df_data[border1s[0]:border2s[0]] + self.scaler.fit(train_data.values) + data = self.scaler.transform(df_data.values) + else: + data = df_data.values + + df_stamp = df_raw[['date']][border1:border2] + df_stamp['date'] = pd.to_datetime(df_stamp.date) + if self.timeenc == 0: + df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1) + df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1) + df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1) + df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1) + data_stamp = df_stamp.drop(['date'], 1).values + elif self.timeenc == 1: + data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq) + data_stamp = data_stamp.transpose(1, 0) + + self.data_x = data[border1:border2] + self.data_y = data[border1:border2] + self.data_stamp = data_stamp + + + def __getitem__(self, index): + feat_id = index // self.tot_len + s_begin = index % self.tot_len + + s_end = s_begin + self.seq_len + r_begin = s_end - self.label_len + r_end = r_begin + self.label_len + self.pred_len + seq_x = self.data_x[s_begin:s_end, feat_id:feat_id + 1] + seq_y = self.data_y[r_begin:r_end, feat_id:feat_id + 1] + seq_x_mark = self.data_stamp[s_begin:s_end] + seq_y_mark = self.data_stamp[r_begin:r_end] + + return seq_x, seq_y, seq_x_mark, seq_y_mark + + def __len__(self): + return (len(self.data_x) - self.seq_len - self.pred_len + 1) * self.enc_in + + def inverse_transform(self, data): + return self.scaler.inverse_transform(data) + + +class Dataset_ETT_minute(Dataset): + def __init__(self, root_path, flag='train', size=None, + features='S', data_path='ETTm1.csv', + target='OT', scale=True, timeenc=0, freq='t', percent=100, + seasonal_patterns=None): + if size == None: + self.seq_len = 24 * 4 * 4 + self.label_len = 24 * 4 + self.pred_len = 24 * 4 + else: + self.seq_len = size[0] + self.label_len = size[1] + self.pred_len = size[2] + # init + assert flag in ['train', 'test', 'val'] + type_map = {'train': 0, 'val': 1, 'test': 2} + self.set_type = type_map[flag] + + self.percent = percent + self.features = features + self.target = target + self.scale = scale + self.timeenc = timeenc + self.freq = freq + + self.root_path = root_path + self.data_path = data_path + self.__read_data__() + + self.enc_in = self.data_x.shape[-1] + self.tot_len = len(self.data_x) - self.seq_len - self.pred_len + 1 + + def __read_data__(self): + self.scaler = StandardScaler() + df_raw = pd.read_csv(os.path.join(self.root_path, + self.data_path)) + + border1s = [0, 12 * 30 * 24 * 4 - self.seq_len, 12 * 30 * 24 * 4 + 4 * 30 * 24 * 4 - self.seq_len] + border2s = [12 * 30 * 24 * 4, 12 * 30 * 24 * 4 + 4 * 30 * 24 * 4, 12 * 30 * 24 * 4 + 8 * 30 * 24 * 4] + + border1 = border1s[self.set_type] + border2 = border2s[self.set_type] + + if self.set_type == 0: + border2 = (border2 - self.seq_len) * self.percent // 100 + self.seq_len + + if self.features == 'M' or self.features == 'MS': + cols_data = df_raw.columns[1:] + df_data = df_raw[cols_data] + elif self.features == 'S': + df_data = df_raw[[self.target]] + + if self.scale: + train_data = df_data[border1s[0]:border2s[0]] + self.scaler.fit(train_data.values) + data = self.scaler.transform(df_data.values) + else: + data = df_data.values + + df_stamp = df_raw[['date']][border1:border2] + df_stamp['date'] = pd.to_datetime(df_stamp.date) + if self.timeenc == 0: + df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1) + df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1) + df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1) + df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1) + df_stamp['minute'] = df_stamp.date.apply(lambda row: row.minute, 1) + df_stamp['minute'] = df_stamp.minute.map(lambda x: x // 15) + data_stamp = df_stamp.drop(['date'], 1).values + elif self.timeenc == 1: + data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq) + data_stamp = data_stamp.transpose(1, 0) + + self.data_x = data[border1:border2] + self.data_y = data[border1:border2] + self.data_stamp = data_stamp + + def __getitem__(self, index): + feat_id = index // self.tot_len + s_begin = index % self.tot_len + + s_end = s_begin + self.seq_len + r_begin = s_end - self.label_len + r_end = r_begin + self.label_len + self.pred_len + seq_x = self.data_x[s_begin:s_end, feat_id:feat_id + 1] + seq_y = self.data_y[r_begin:r_end, feat_id:feat_id + 1] + seq_x_mark = self.data_stamp[s_begin:s_end] + seq_y_mark = self.data_stamp[r_begin:r_end] + + return seq_x, seq_y, seq_x_mark, seq_y_mark + + def __len__(self): + return (len(self.data_x) - self.seq_len - self.pred_len + 1) * self.enc_in + + def inverse_transform(self, data): + return self.scaler.inverse_transform(data) + + +class Dataset_Custom(Dataset): + def __init__(self, root_path, flag='train', size=None, + features='S', data_path='ETTh1.csv', + target='OT', scale=True, timeenc=0, freq='h', percent=100, + seasonal_patterns=None): + if size == None: + self.seq_len = 24 * 4 * 4 + self.label_len = 24 * 4 + self.pred_len = 24 * 4 + else: + self.seq_len = size[0] + self.label_len = size[1] + self.pred_len = size[2] + # init + assert flag in ['train', 'test', 'val'] + type_map = {'train': 0, 'val': 1, 'test': 2} + self.set_type = type_map[flag] + + self.features = features + self.target = target + self.scale = scale + self.timeenc = timeenc + self.freq = freq + self.percent = percent + + self.root_path = root_path + self.data_path = data_path + self.__read_data__() + + self.enc_in = self.data_x.shape[-1] + self.tot_len = len(self.data_x) - self.seq_len - self.pred_len + 1 + + def __read_data__(self): + self.scaler = StandardScaler() + df_raw = pd.read_csv(os.path.join(self.root_path, + self.data_path)) + + ''' + df_raw.columns: ['date', ...(other features), target feature] + ''' + cols = list(df_raw.columns) + cols.remove(self.target) + cols.remove('date') + df_raw = df_raw[['date'] + cols + [self.target]] + num_train = int(len(df_raw) * 0.7) + num_test = int(len(df_raw) * 0.2) + num_vali = len(df_raw) - num_train - num_test + border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len] + border2s = [num_train, num_train + num_vali, len(df_raw)] + border1 = border1s[self.set_type] + border2 = border2s[self.set_type] + + if self.set_type == 0: + border2 = (border2 - self.seq_len) * self.percent // 100 + self.seq_len + + if self.features == 'M' or self.features == 'MS': + cols_data = df_raw.columns[1:] + df_data = df_raw[cols_data] + elif self.features == 'S': + df_data = df_raw[[self.target]] + + if self.scale: + train_data = df_data[border1s[0]:border2s[0]] + self.scaler.fit(train_data.values) + data = self.scaler.transform(df_data.values) + else: + data = df_data.values + + df_stamp = df_raw[['date']][border1:border2] + df_stamp['date'] = pd.to_datetime(df_stamp.date) + if self.timeenc == 0: + df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1) + df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1) + df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1) + df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1) + data_stamp = df_stamp.drop(['date'], 1).values + elif self.timeenc == 1: + data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq) + data_stamp = data_stamp.transpose(1, 0) + + self.data_x = data[border1:border2] + self.data_y = data[border1:border2] + self.data_stamp = data_stamp + + def __getitem__(self, index): + feat_id = index // self.tot_len + s_begin = index % self.tot_len + + s_end = s_begin + self.seq_len + r_begin = s_end - self.label_len + r_end = r_begin + self.label_len + self.pred_len + seq_x = self.data_x[s_begin:s_end, feat_id:feat_id + 1] + seq_y = self.data_y[r_begin:r_end, feat_id:feat_id + 1] + seq_x_mark = self.data_stamp[s_begin:s_end] + seq_y_mark = self.data_stamp[r_begin:r_end] + + return seq_x, seq_y, seq_x_mark, seq_y_mark + + def __len__(self): + return (len(self.data_x) - self.seq_len - self.pred_len + 1) * self.enc_in + + def inverse_transform(self, data): + return self.scaler.inverse_transform(data) + + +class Dataset_M4(Dataset): + def __init__(self, root_path, flag='pred', size=None, + features='S', data_path='ETTh1.csv', + target='OT', scale=False, inverse=False, timeenc=0, freq='15min', + seasonal_patterns='Yearly'): + self.features = features + self.target = target + self.scale = scale + self.inverse = inverse + self.timeenc = timeenc + self.root_path = root_path + + self.seq_len = size[0] + self.label_len = size[1] + self.pred_len = size[2] + + self.seasonal_patterns = seasonal_patterns + self.history_size = M4Meta.history_size[seasonal_patterns] + self.window_sampling_limit = int(self.history_size * self.pred_len) + self.flag = flag + + self.__read_data__() + + def __read_data__(self): + # M4Dataset.initialize() + if self.flag == 'train': + dataset = M4Dataset.load(training=True, dataset_file=self.root_path) + else: + dataset = M4Dataset.load(training=False, dataset_file=self.root_path) + training_values = np.array( + [v[~np.isnan(v)] for v in + dataset.values[dataset.groups == self.seasonal_patterns]]) # split different frequencies + self.ids = np.array([i for i in dataset.ids[dataset.groups == self.seasonal_patterns]]) + self.timeseries = [ts for ts in training_values] + + def __getitem__(self, index): + insample = np.zeros((self.seq_len, 1)) + insample_mask = np.zeros((self.seq_len, 1)) + outsample = np.zeros((self.pred_len + self.label_len, 1)) + outsample_mask = np.zeros((self.pred_len + self.label_len, 1)) # m4 dataset + + sampled_timeseries = self.timeseries[index] + cut_point = np.random.randint(low=max(1, len(sampled_timeseries) - self.window_sampling_limit), + high=len(sampled_timeseries), + size=1)[0] + + insample_window = sampled_timeseries[max(0, cut_point - self.seq_len):cut_point] + insample[-len(insample_window):, 0] = insample_window + insample_mask[-len(insample_window):, 0] = 1.0 + outsample_window = sampled_timeseries[ + cut_point - self.label_len:min(len(sampled_timeseries), cut_point + self.pred_len)] + outsample[:len(outsample_window), 0] = outsample_window + outsample_mask[:len(outsample_window), 0] = 1.0 + return insample, outsample, insample_mask, outsample_mask + + def __len__(self): + return len(self.timeseries) + + def inverse_transform(self, data): + return self.scaler.inverse_transform(data) + + def last_insample_window(self): + """ + The last window of insample size of all timeseries. + This function does not support batching and does not reshuffle timeseries. + + :return: Last insample window of all timeseries. Shape "timeseries, insample size" + """ + insample = np.zeros((len(self.timeseries), self.seq_len)) + insample_mask = np.zeros((len(self.timeseries), self.seq_len)) + for i, ts in enumerate(self.timeseries): + ts_last_window = ts[-self.seq_len:] + insample[i, -len(ts):] = ts_last_window + insample_mask[i, -len(ts):] = 1.0 + return insample, insample_mask + diff --git a/data_provider/m4.py b/data_provider/m4.py new file mode 100644 index 0000000..a541851 --- /dev/null +++ b/data_provider/m4.py @@ -0,0 +1,132 @@ +# This source code is provided for the purposes of scientific reproducibility +# under the following limited license from Element AI Inc. The code is an +# implementation of the N-BEATS model (Oreshkin et al., N-BEATS: Neural basis +# expansion analysis for interpretable time series forecasting, +# https://arxiv.org/abs/1905.10437). The copyright to the source code is +# licensed under the Creative Commons - Attribution-NonCommercial 4.0 +# International license (CC BY-NC 4.0): +# https://creativecommons.org/licenses/by-nc/4.0/. Any commercial use (whether +# for the benefit of third parties or internally in production) requires an +# explicit license. The subject-matter of the N-BEATS model and associated +# materials are the property of Element AI Inc. and may be subject to patent +# protection. No license to patents is granted hereunder (whether express or +# implied). Copyright © 2020 Element AI Inc. All rights reserved. + +""" +M4 Dataset +""" +from dataclasses import dataclass + +import numpy as np +import pandas as pd +import logging +import os +import pathlib +import sys +from urllib import request + + +def url_file_name(url: str) -> str: + """ + Extract file name from url. + + :param url: URL to extract file name from. + :return: File name. + """ + return url.split('/')[-1] if len(url) > 0 else '' + + +def download(url: str, file_path: str) -> None: + """ + Download a file to the given path. + + :param url: URL to download + :param file_path: Where to download the content. + """ + + def progress(count, block_size, total_size): + progress_pct = float(count * block_size) / float(total_size) * 100.0 + sys.stdout.write('\rDownloading {} to {} {:.1f}%'.format(url, file_path, progress_pct)) + sys.stdout.flush() + + if not os.path.isfile(file_path): + opener = request.build_opener() + opener.addheaders = [('User-agent', 'Mozilla/5.0')] + request.install_opener(opener) + pathlib.Path(os.path.dirname(file_path)).mkdir(parents=True, exist_ok=True) + f, _ = request.urlretrieve(url, file_path, progress) + sys.stdout.write('\n') + sys.stdout.flush() + file_info = os.stat(f) + logging.info(f'Successfully downloaded {os.path.basename(file_path)} {file_info.st_size} bytes.') + else: + file_info = os.stat(file_path) + logging.info(f'File already exists: {file_path} {file_info.st_size} bytes.') + + +@dataclass() +class M4Dataset: + ids: np.ndarray + groups: np.ndarray + frequencies: np.ndarray + horizons: np.ndarray + values: np.ndarray + + @staticmethod + def load(training: bool = True, dataset_file: str = '../dataset/m4') -> 'M4Dataset': + """ + Load cached dataset. + + :param training: Load training part if training is True, test part otherwise. + """ + info_file = os.path.join(dataset_file, 'M4-info.csv') + train_cache_file = os.path.join(dataset_file, 'training.npz') + test_cache_file = os.path.join(dataset_file, 'test.npz') + m4_info = pd.read_csv(info_file) + return M4Dataset(ids=m4_info.M4id.values, + groups=m4_info.SP.values, + frequencies=m4_info.Frequency.values, + horizons=m4_info.Horizon.values, + values=np.load( + train_cache_file if training else test_cache_file, + allow_pickle=True)) + + +@dataclass() +class M4Meta: + seasonal_patterns = ['Yearly', 'Quarterly', 'Monthly', 'Weekly', 'Daily', 'Hourly'] + horizons = [6, 8, 18, 13, 14, 48] + frequencies = [1, 4, 12, 1, 1, 24] + horizons_map = { + 'Yearly': 6, + 'Quarterly': 8, + 'Monthly': 18, + 'Weekly': 13, + 'Daily': 14, + 'Hourly': 48 + } # different predict length + frequency_map = { + 'Yearly': 1, + 'Quarterly': 4, + 'Monthly': 12, + 'Weekly': 1, + 'Daily': 1, + 'Hourly': 24 + } + history_size = { + 'Yearly': 1.5, + 'Quarterly': 1.5, + 'Monthly': 1.5, + 'Weekly': 10, + 'Daily': 10, + 'Hourly': 10 + } # from interpretable.gin + + +def load_m4_info() -> pd.DataFrame: + """ + Load M4Info file. + + :return: Pandas DataFrame of M4Info. + """ + return pd.read_csv(INFO_FILE_PATH) diff --git a/dataset/prompt_bank/ETT.txt b/dataset/prompt_bank/ETT.txt new file mode 100644 index 0000000..f565c11 --- /dev/null +++ b/dataset/prompt_bank/ETT.txt @@ -0,0 +1,2 @@ +The Electricity Transformer Temperature (ETT) is a crucial indicator in the electric power long-term deployment. This dataset consists of 2 years data from two separated counties in China. To explore the granularity on the Long sequence time-series forecasting (LSTF) problem, different subsets are created, {ETTh1, ETTh2} for 1-hour-level and ETTm1 for 15-minutes-level. Each data point consists of the target value ”oil temperature” and 6 power load features. The train/val/test is 12/4/4 months. + diff --git a/dataset/prompt_bank/m4.txt b/dataset/prompt_bank/m4.txt new file mode 100644 index 0000000..ee27c76 --- /dev/null +++ b/dataset/prompt_bank/m4.txt @@ -0,0 +1,2 @@ +The M4 dataset is a collection of 100,000 time series used for the fourth edition of the Makridakis forecasting Competition. The M4 dataset consists of time series of yearly, quarterly, monthly and other (weekly, daily and hourly) data, which are divided into training and test sets. The minimum numbers of observations in the training test are 13 for yearly, 16 for quarterly, 42 for monthly, 80 for weekly, 93 for daily and 700 for hourly series. The participants were asked to produce the following numbers of forecasts beyond the available data that they had been given: six for yearly, eight for quarterly, 18 for monthly series, 13 for weekly series and 14 and 48 forecasts respectively for the daily and hourly ones. + diff --git a/ds_config_zero2.json b/ds_config_zero2.json new file mode 100644 index 0000000..aa37aae --- /dev/null +++ b/ds_config_zero2.json @@ -0,0 +1,21 @@ +{ + "bf16": { + "enabled": true, + "auto_cast": true + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "sub_group_size": 1e9 + }, + "gradient_accumulation_steps": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "steps_per_print": 10, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/figures/framework.png b/figures/framework.png new file mode 100644 index 0000000..17c27b4 Binary files /dev/null and b/figures/framework.png differ diff --git a/figures/logo.png b/figures/logo.png new file mode 100644 index 0000000..befdabc Binary files /dev/null and b/figures/logo.png differ diff --git a/figures/method-detailed-illustration.png b/figures/method-detailed-illustration.png new file mode 100644 index 0000000..62badae Binary files /dev/null and b/figures/method-detailed-illustration.png differ diff --git a/layers/AutoCorrelation.py b/layers/AutoCorrelation.py new file mode 100644 index 0000000..5faec7c --- /dev/null +++ b/layers/AutoCorrelation.py @@ -0,0 +1,163 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import matplotlib.pyplot as plt +import numpy as np +import math +from math import sqrt +import os + + +class AutoCorrelation(nn.Module): + """ + AutoCorrelation Mechanism with the following two phases: + (1) period-based dependencies discovery + (2) time delay aggregation + This block can replace the self-attention family mechanism seamlessly. + """ + + def __init__(self, mask_flag=True, factor=1, scale=None, attention_dropout=0.1, output_attention=False): + super(AutoCorrelation, self).__init__() + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def time_delay_agg_training(self, values, corr): + """ + SpeedUp version of Autocorrelation (a batch-normalization style design) + This is for the training phase. + """ + head = values.shape[1] + channel = values.shape[2] + length = values.shape[3] + # find top k + top_k = int(self.factor * math.log(length)) + mean_value = torch.mean(torch.mean(corr, dim=1), dim=1) + index = torch.topk(torch.mean(mean_value, dim=0), top_k, dim=-1)[1] + weights = torch.stack([mean_value[:, index[i]] for i in range(top_k)], dim=-1) + # update corr + tmp_corr = torch.softmax(weights, dim=-1) + # aggregation + tmp_values = values + delays_agg = torch.zeros_like(values).float() + for i in range(top_k): + pattern = torch.roll(tmp_values, -int(index[i]), -1) + delays_agg = delays_agg + pattern * \ + (tmp_corr[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length)) + return delays_agg + + def time_delay_agg_inference(self, values, corr): + """ + SpeedUp version of Autocorrelation (a batch-normalization style design) + This is for the inference phase. + """ + batch = values.shape[0] + head = values.shape[1] + channel = values.shape[2] + length = values.shape[3] + # index init + init_index = torch.arange(length).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(batch, head, channel, 1).cuda() + # find top k + top_k = int(self.factor * math.log(length)) + mean_value = torch.mean(torch.mean(corr, dim=1), dim=1) + weights, delay = torch.topk(mean_value, top_k, dim=-1) + # update corr + tmp_corr = torch.softmax(weights, dim=-1) + # aggregation + tmp_values = values.repeat(1, 1, 1, 2) + delays_agg = torch.zeros_like(values).float() + for i in range(top_k): + tmp_delay = init_index + delay[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length) + pattern = torch.gather(tmp_values, dim=-1, index=tmp_delay) + delays_agg = delays_agg + pattern * \ + (tmp_corr[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length)) + return delays_agg + + def time_delay_agg_full(self, values, corr): + """ + Standard version of Autocorrelation + """ + batch = values.shape[0] + head = values.shape[1] + channel = values.shape[2] + length = values.shape[3] + # index init + init_index = torch.arange(length).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(batch, head, channel, 1).cuda() + # find top k + top_k = int(self.factor * math.log(length)) + weights, delay = torch.topk(corr, top_k, dim=-1) + # update corr + tmp_corr = torch.softmax(weights, dim=-1) + # aggregation + tmp_values = values.repeat(1, 1, 1, 2) + delays_agg = torch.zeros_like(values).float() + for i in range(top_k): + tmp_delay = init_index + delay[..., i].unsqueeze(-1) + pattern = torch.gather(tmp_values, dim=-1, index=tmp_delay) + delays_agg = delays_agg + pattern * (tmp_corr[..., i].unsqueeze(-1)) + return delays_agg + + def forward(self, queries, keys, values, attn_mask): + B, L, H, E = queries.shape + _, S, _, D = values.shape + if L > S: + zeros = torch.zeros_like(queries[:, :(L - S), :]).float() + values = torch.cat([values, zeros], dim=1) + keys = torch.cat([keys, zeros], dim=1) + else: + values = values[:, :L, :, :] + keys = keys[:, :L, :, :] + + # period-based dependencies + q_fft = torch.fft.rfft(queries.permute(0, 2, 3, 1).contiguous(), dim=-1) + k_fft = torch.fft.rfft(keys.permute(0, 2, 3, 1).contiguous(), dim=-1) + res = q_fft * torch.conj(k_fft) + corr = torch.fft.irfft(res, dim=-1) + + # time delay agg + if self.training: + V = self.time_delay_agg_training(values.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2) + else: + V = self.time_delay_agg_inference(values.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2) + + if self.output_attention: + return (V.contiguous(), corr.permute(0, 3, 1, 2)) + else: + return (V.contiguous(), None) + + +class AutoCorrelationLayer(nn.Module): + def __init__(self, correlation, d_model, n_heads, d_keys=None, + d_values=None): + super(AutoCorrelationLayer, self).__init__() + + d_keys = d_keys or (d_model // n_heads) + d_values = d_values or (d_model // n_heads) + + self.inner_correlation = correlation + self.query_projection = nn.Linear(d_model, d_keys * n_heads) + self.key_projection = nn.Linear(d_model, d_keys * n_heads) + self.value_projection = nn.Linear(d_model, d_values * n_heads) + self.out_projection = nn.Linear(d_values * n_heads, d_model) + self.n_heads = n_heads + + def forward(self, queries, keys, values, attn_mask): + B, L, _ = queries.shape + _, S, _ = keys.shape + H = self.n_heads + + queries = self.query_projection(queries).view(B, L, H, -1) + keys = self.key_projection(keys).view(B, S, H, -1) + values = self.value_projection(values).view(B, S, H, -1) + + out, attn = self.inner_correlation( + queries, + keys, + values, + attn_mask + ) + out = out.view(B, L, -1) + + return self.out_projection(out), attn diff --git a/layers/Autoformer_EncDec.py b/layers/Autoformer_EncDec.py new file mode 100644 index 0000000..6fce4bc --- /dev/null +++ b/layers/Autoformer_EncDec.py @@ -0,0 +1,203 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class my_Layernorm(nn.Module): + """ + Special designed layernorm for the seasonal part + """ + + def __init__(self, channels): + super(my_Layernorm, self).__init__() + self.layernorm = nn.LayerNorm(channels) + + def forward(self, x): + x_hat = self.layernorm(x) + bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1) + return x_hat - bias + + +class moving_avg(nn.Module): + """ + Moving average block to highlight the trend of time series + """ + + def __init__(self, kernel_size, stride): + super(moving_avg, self).__init__() + self.kernel_size = kernel_size + self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) + + def forward(self, x): + # padding on the both ends of time series + front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) + end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) + x = torch.cat([front, x, end], dim=1) + x = self.avg(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + return x + + +class series_decomp(nn.Module): + """ + Series decomposition block + """ + + def __init__(self, kernel_size): + super(series_decomp, self).__init__() + self.moving_avg = moving_avg(kernel_size, stride=1) + + def forward(self, x): + moving_mean = self.moving_avg(x) + res = x - moving_mean + return res, moving_mean + + +class series_decomp_multi(nn.Module): + """ + Multiple Series decomposition block from FEDformer + """ + + def __init__(self, kernel_size): + super(series_decomp_multi, self).__init__() + self.kernel_size = kernel_size + self.series_decomp = [series_decomp(kernel) for kernel in kernel_size] + + def forward(self, x): + moving_mean = [] + res = [] + for func in self.series_decomp: + sea, moving_avg = func(x) + moving_mean.append(moving_avg) + res.append(sea) + + sea = sum(res) / len(res) + moving_mean = sum(moving_mean) / len(moving_mean) + return sea, moving_mean + + +class EncoderLayer(nn.Module): + """ + Autoformer encoder layer with the progressive decomposition architecture + """ + + def __init__(self, attention, d_model, d_ff=None, moving_avg=25, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask + ) + x = x + self.dropout(new_x) + x, _ = self.decomp1(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + res, _ = self.decomp2(x + y) + return res, attn + + +class Encoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def forward(self, x, attn_mask=None): + attns = [] + if self.conv_layers is not None: + for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers): + x, attn = attn_layer(x, attn_mask=attn_mask) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + """ + Autoformer decoder layer with the progressive decomposition architecture + """ + + def __init__(self, self_attention, cross_attention, d_model, c_out, d_ff=None, + moving_avg=25, dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.decomp3 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.projection = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=3, stride=1, padding=1, + padding_mode='circular', bias=False) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None): + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask + )[0]) + x, trend1 = self.decomp1(x) + x = x + self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask + )[0]) + x, trend2 = self.decomp2(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + x, trend3 = self.decomp3(x + y) + + residual_trend = trend1 + trend2 + trend3 + residual_trend = self.projection(residual_trend.permute(0, 2, 1)).transpose(1, 2) + return x, residual_trend + + +class Decoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, trend=None): + for layer in self.layers: + x, residual_trend = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask) + trend = trend + residual_trend + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x, trend diff --git a/layers/Conv_Blocks.py b/layers/Conv_Blocks.py new file mode 100644 index 0000000..96ce3fc --- /dev/null +++ b/layers/Conv_Blocks.py @@ -0,0 +1,60 @@ +import torch +import torch.nn as nn + + +class Inception_Block_V1(nn.Module): + def __init__(self, in_channels, out_channels, num_kernels=6, init_weight=True): + super(Inception_Block_V1, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.num_kernels = num_kernels + kernels = [] + for i in range(self.num_kernels): + kernels.append(nn.Conv2d(in_channels, out_channels, kernel_size=2 * i + 1, padding=i)) + self.kernels = nn.ModuleList(kernels) + if init_weight: + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + res_list = [] + for i in range(self.num_kernels): + res_list.append(self.kernels[i](x)) + res = torch.stack(res_list, dim=-1).mean(-1) + return res + + +class Inception_Block_V2(nn.Module): + def __init__(self, in_channels, out_channels, num_kernels=6, init_weight=True): + super(Inception_Block_V2, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.num_kernels = num_kernels + kernels = [] + for i in range(self.num_kernels // 2): + kernels.append(nn.Conv2d(in_channels, out_channels, kernel_size=[1, 2 * i + 3], padding=[0, i + 1])) + kernels.append(nn.Conv2d(in_channels, out_channels, kernel_size=[2 * i + 3, 1], padding=[i + 1, 0])) + kernels.append(nn.Conv2d(in_channels, out_channels, kernel_size=1)) + self.kernels = nn.ModuleList(kernels) + if init_weight: + self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + res_list = [] + for i in range(self.num_kernels + 1): + res_list.append(self.kernels[i](x)) + res = torch.stack(res_list, dim=-1).mean(-1) + return res diff --git a/layers/Embed.py b/layers/Embed.py new file mode 100644 index 0000000..2b0f11b --- /dev/null +++ b/layers/Embed.py @@ -0,0 +1,198 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.nn.utils import weight_norm +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. + pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + minute_x = self.minute_embed(x[:, :, 4]) if hasattr( + self, 'minute_embed') else 0. + hour_x = self.hour_embed(x[:, :, 3]) + weekday_x = self.weekday_embed(x[:, :, 2]) + day_x = self.day_embed(x[:, :, 1]) + month_x = self.month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + +class TimeFeatureEmbedding(nn.Module): + def __init__(self, d_model, embed_type='timeF', freq='h'): + super(TimeFeatureEmbedding, self).__init__() + + freq_map = {'h': 4, 't': 5, 's': 6, + 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3} + d_inp = freq_map[freq] + self.embed = nn.Linear(d_inp, d_model, bias=False) + + def forward(self, x): + return self.embed(x) + + +class DataEmbedding(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + if x_mark is None: + x = self.value_embedding(x) + self.position_embedding(x).to(x.device) + else: + x = self.value_embedding( + x) + self.temporal_embedding(x_mark) + self.position_embedding(x) + return self.dropout(x) + + +class DataEmbedding_wo_pos(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_wo_pos, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + if x_mark is None: + x = self.value_embedding(x) + else: + x = self.value_embedding(x) + self.temporal_embedding(x_mark) + return self.dropout(x) + + +class ReplicationPad1d(nn.Module): + def __init__(self, padding) -> None: + super(ReplicationPad1d, self).__init__() + self.padding = padding + + def forward(self, input: Tensor) -> Tensor: + replicate_padding = input[:, :, -1].unsqueeze(-1).repeat(1, 1, self.padding[-1]) + output = torch.cat([input, replicate_padding], dim=-1) + return output + + +class PatchEmbedding(nn.Module): + def __init__(self, d_model, patch_len, stride, dropout): + super(PatchEmbedding, self).__init__() + # Patching + self.patch_len = patch_len + self.stride = stride + self.padding_patch_layer = ReplicationPad1d((0, stride)) + + # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = TokenEmbedding(patch_len, d_model) + + # Positional embedding + # self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + return self.dropout(x), n_vars + + +class DataEmbedding_wo_time(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_wo_time, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x): + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x) diff --git a/layers/SelfAttention_Family.py b/layers/SelfAttention_Family.py new file mode 100644 index 0000000..3817b96 --- /dev/null +++ b/layers/SelfAttention_Family.py @@ -0,0 +1,242 @@ +import torch +import torch.nn as nn +import numpy as np +from math import sqrt +from utils.masking import TriangularCausalMask, ProbMask +from reformer_pytorch import LSHSelfAttention + + +class DSAttention(nn.Module): + '''De-stationary Attention''' + + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(DSAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, H, E = queries.shape + _, S, _, D = values.shape + scale = self.scale or 1. / sqrt(E) + + tau = 1.0 if tau is None else tau.unsqueeze( + 1).unsqueeze(1) # B x 1 x 1 x 1 + delta = 0.0 if delta is None else delta.unsqueeze( + 1).unsqueeze(1) # B x 1 x 1 x S + + # De-stationary Attention, rescaling pre-softmax score with learned de-stationary factors + scores = torch.einsum("blhe,bshe->bhls", queries, keys) * tau + delta + + if self.mask_flag: + if attn_mask is None: + attn_mask = TriangularCausalMask(B, L, device=queries.device) + + scores.masked_fill_(attn_mask.mask, -np.inf) + + A = self.dropout(torch.softmax(scale * scores, dim=-1)) + V = torch.einsum("bhls,bshd->blhd", A, values) + + if self.output_attention: + return (V.contiguous(), A) + else: + return (V.contiguous(), None) + + +class FullAttention(nn.Module): + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(FullAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, H, E = queries.shape + _, S, _, D = values.shape + scale = self.scale or 1. / sqrt(E) + + scores = torch.einsum("blhe,bshe->bhls", queries, keys) + + if self.mask_flag: + if attn_mask is None: + attn_mask = TriangularCausalMask(B, L, device=queries.device) + + scores.masked_fill_(attn_mask.mask, -np.inf) + + A = self.dropout(torch.softmax(scale * scores, dim=-1)) + V = torch.einsum("bhls,bshd->blhd", A, values) + + if self.output_attention: + return (V.contiguous(), A) + else: + return (V.contiguous(), None) + + +class ProbAttention(nn.Module): + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(ProbAttention, self).__init__() + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def _prob_QK(self, Q, K, sample_k, n_top): # n_top: c*ln(L_q) + # Q [B, H, L, D] + B, H, L_K, E = K.shape + _, _, L_Q, _ = Q.shape + + # calculate the sampled Q_K + K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E) + # real U = U_part(factor*ln(L_k))*L_q + index_sample = torch.randint(L_K, (L_Q, sample_k)) + K_sample = K_expand[:, :, torch.arange( + L_Q).unsqueeze(1), index_sample, :] + Q_K_sample = torch.matmul( + Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze() + + # find the Top_k query with sparisty measurement + M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K) + M_top = M.topk(n_top, sorted=False)[1] + + # use the reduced Q to calculate Q_K + Q_reduce = Q[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + M_top, :] # factor*ln(L_q) + Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1)) # factor*ln(L_q)*L_k + + return Q_K, M_top + + def _get_initial_context(self, V, L_Q): + B, H, L_V, D = V.shape + if not self.mask_flag: + # V_sum = V.sum(dim=-2) + V_sum = V.mean(dim=-2) + contex = V_sum.unsqueeze(-2).expand(B, H, + L_Q, V_sum.shape[-1]).clone() + else: # use mask + # requires that L_Q == L_V, i.e. for self-attention only + assert (L_Q == L_V) + contex = V.cumsum(dim=-2) + return contex + + def _update_context(self, context_in, V, scores, index, L_Q, attn_mask): + B, H, L_V, D = V.shape + + if self.mask_flag: + attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device) + scores.masked_fill_(attn_mask.mask, -np.inf) + + attn = torch.softmax(scores, dim=-1) # nn.Softmax(dim=-1)(scores) + + context_in[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + index, :] = torch.matmul(attn, V).type_as(context_in) + if self.output_attention: + attns = (torch.ones([B, H, L_V, L_V]) / + L_V).type_as(attn).to(attn.device) + attns[torch.arange(B)[:, None, None], torch.arange(H)[ + None, :, None], index, :] = attn + return (context_in, attns) + else: + return (context_in, None) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L_Q, H, D = queries.shape + _, L_K, _, _ = keys.shape + + queries = queries.transpose(2, 1) + keys = keys.transpose(2, 1) + values = values.transpose(2, 1) + + U_part = self.factor * \ + np.ceil(np.log(L_K)).astype('int').item() # c*ln(L_k) + u = self.factor * \ + np.ceil(np.log(L_Q)).astype('int').item() # c*ln(L_q) + + U_part = U_part if U_part < L_K else L_K + u = u if u < L_Q else L_Q + + scores_top, index = self._prob_QK( + queries, keys, sample_k=U_part, n_top=u) + + # add scale factor + scale = self.scale or 1. / sqrt(D) + if scale is not None: + scores_top = scores_top * scale + # get the context + context = self._get_initial_context(values, L_Q) + # update the context with selected top_k queries + context, attn = self._update_context( + context, values, scores_top, index, L_Q, attn_mask) + + return context.contiguous(), attn + + +class AttentionLayer(nn.Module): + def __init__(self, attention, d_model, n_heads, d_keys=None, + d_values=None): + super(AttentionLayer, self).__init__() + + d_keys = d_keys or (d_model // n_heads) + d_values = d_values or (d_model // n_heads) + + self.inner_attention = attention + self.query_projection = nn.Linear(d_model, d_keys * n_heads) + self.key_projection = nn.Linear(d_model, d_keys * n_heads) + self.value_projection = nn.Linear(d_model, d_values * n_heads) + self.out_projection = nn.Linear(d_values * n_heads, d_model) + self.n_heads = n_heads + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, _ = queries.shape + _, S, _ = keys.shape + H = self.n_heads + + queries = self.query_projection(queries).view(B, L, H, -1) + keys = self.key_projection(keys).view(B, S, H, -1) + values = self.value_projection(values).view(B, S, H, -1) + + out, attn = self.inner_attention( + queries, + keys, + values, + attn_mask, + tau=tau, + delta=delta + ) + out = out.view(B, L, -1) + + return self.out_projection(out), attn + + +class ReformerLayer(nn.Module): + def __init__(self, attention, d_model, n_heads, d_keys=None, + d_values=None, causal=False, bucket_size=4, n_hashes=4): + super().__init__() + self.bucket_size = bucket_size + self.attn = LSHSelfAttention( + dim=d_model, + heads=n_heads, + bucket_size=bucket_size, + n_hashes=n_hashes, + causal=causal + ) + + def fit_length(self, queries): + # inside reformer: assert N % (bucket_size * 2) == 0 + B, N, C = queries.shape + if N % (self.bucket_size * 2) == 0: + return queries + else: + # fill the time series + fill_len = (self.bucket_size * 2) - (N % (self.bucket_size * 2)) + return torch.cat([queries, torch.zeros([B, fill_len, C]).to(queries.device)], dim=1) + + def forward(self, queries, keys, values, attn_mask, tau, delta): + # in Reformer: defalut queries=keys + B, N, C = queries.shape + queries = self.attn(self.fit_length(queries))[:, :N, :] + return queries, None diff --git a/layers/StandardNorm.py b/layers/StandardNorm.py new file mode 100755 index 0000000..990d0fd --- /dev/null +++ b/layers/StandardNorm.py @@ -0,0 +1,68 @@ +import torch +import torch.nn as nn + + +class Normalize(nn.Module): + def __init__(self, num_features: int, eps=1e-5, affine=False, subtract_last=False, non_norm=False): + """ + :param num_features: the number of features or channels + :param eps: a value added for numerical stability + :param affine: if True, RevIN has learnable affine parameters + """ + super(Normalize, self).__init__() + self.num_features = num_features + self.eps = eps + self.affine = affine + self.subtract_last = subtract_last + self.non_norm = non_norm + if self.affine: + self._init_params() + + def forward(self, x, mode: str): + if mode == 'norm': + self._get_statistics(x) + x = self._normalize(x) + elif mode == 'denorm': + x = self._denormalize(x) + else: + raise NotImplementedError + return x + + def _init_params(self): + # initialize RevIN params: (C,) + self.affine_weight = nn.Parameter(torch.ones(self.num_features)) + self.affine_bias = nn.Parameter(torch.zeros(self.num_features)) + + def _get_statistics(self, x): + dim2reduce = tuple(range(1, x.ndim - 1)) + if self.subtract_last: + self.last = x[:, -1, :].unsqueeze(1) + else: + self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() + self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() + + def _normalize(self, x): + if self.non_norm: + return x + if self.subtract_last: + x = x - self.last + else: + x = x - self.mean + x = x / self.stdev + if self.affine: + x = x * self.affine_weight + x = x + self.affine_bias + return x + + def _denormalize(self, x): + if self.non_norm: + return x + if self.affine: + x = x - self.affine_bias + x = x / (self.affine_weight + self.eps * self.eps) + x = x * self.stdev + if self.subtract_last: + x = x + self.last + else: + x = x + self.mean + return x diff --git a/layers/Transformer_EncDec.py b/layers/Transformer_EncDec.py new file mode 100644 index 0000000..dabf4c2 --- /dev/null +++ b/layers/Transformer_EncDec.py @@ -0,0 +1,135 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ConvLayer(nn.Module): + def __init__(self, c_in): + super(ConvLayer, self).__init__() + self.downConv = nn.Conv1d(in_channels=c_in, + out_channels=c_in, + kernel_size=3, + padding=2, + padding_mode='circular') + self.norm = nn.BatchNorm1d(c_in) + self.activation = nn.ELU() + self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1) + + def forward(self, x): + x = self.downConv(x.permute(0, 2, 1)) + x = self.norm(x) + x = self.activation(x) + x = self.maxPool(x) + x = x.transpose(1, 2) + return x + + +class EncoderLayer(nn.Module): + def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None, tau=None, delta=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask, + tau=tau, delta=delta + ) + x = x + self.dropout(new_x) + + y = x = self.norm1(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm2(x + y), attn + + +class Encoder(nn.Module): + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def forward(self, x, attn_mask=None, tau=None, delta=None): + # x [B, L, D] + attns = [] + if self.conv_layers is not None: + for i, (attn_layer, conv_layer) in enumerate(zip(self.attn_layers, self.conv_layers)): + delta = delta if i == 0 else None + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x, tau=tau, delta=None) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + def __init__(self, self_attention, cross_attention, d_model, d_ff=None, + dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask, + tau=tau, delta=None + )[0]) + x = self.norm1(x) + + x = x + self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask, + tau=tau, delta=delta + )[0]) + + y = x = self.norm2(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm3(x + y) + + +class Decoder(nn.Module): + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + for layer in self.layers: + x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask, tau=tau, delta=delta) + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x diff --git a/layers/__init__.py b/layers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/Autoformer.py b/models/Autoformer.py new file mode 100644 index 0000000..8faf0a2 --- /dev/null +++ b/models/Autoformer.py @@ -0,0 +1,158 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from layers.Embed import DataEmbedding, DataEmbedding_wo_pos +from layers.AutoCorrelation import AutoCorrelation, AutoCorrelationLayer +from layers.Autoformer_EncDec import Encoder, Decoder, EncoderLayer, DecoderLayer, my_Layernorm, series_decomp +import math +import numpy as np + + +class Model(nn.Module): + """ + Autoformer is the first method to achieve the series-wise connection, + with inherent O(LlogL) complexity + Paper link: https://openreview.net/pdf?id=I55UqU-M11y + """ + + def __init__(self, configs): + super(Model, self).__init__() + self.task_name = configs.task_name + self.seq_len = configs.seq_len + self.label_len = configs.label_len + self.pred_len = configs.pred_len + self.output_attention = configs.output_attention + + # Decomp + kernel_size = configs.moving_avg + self.decomp = series_decomp(kernel_size) + + # Embedding + self.enc_embedding = DataEmbedding_wo_pos(configs.enc_in, configs.d_model, configs.embed, configs.freq, + configs.dropout) + # Encoder + self.encoder = Encoder( + [ + EncoderLayer( + AutoCorrelationLayer( + AutoCorrelation(False, configs.factor, attention_dropout=configs.dropout, + output_attention=configs.output_attention), + configs.d_model, configs.n_heads), + configs.d_model, + configs.d_ff, + moving_avg=configs.moving_avg, + dropout=configs.dropout, + activation=configs.activation + ) for l in range(configs.e_layers) + ], + norm_layer=my_Layernorm(configs.d_model) + ) + # Decoder + if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast': + self.dec_embedding = DataEmbedding_wo_pos(configs.dec_in, configs.d_model, configs.embed, configs.freq, + configs.dropout) + self.decoder = Decoder( + [ + DecoderLayer( + AutoCorrelationLayer( + AutoCorrelation(True, configs.factor, attention_dropout=configs.dropout, + output_attention=False), + configs.d_model, configs.n_heads), + AutoCorrelationLayer( + AutoCorrelation(False, configs.factor, attention_dropout=configs.dropout, + output_attention=False), + configs.d_model, configs.n_heads), + configs.d_model, + configs.c_out, + configs.d_ff, + moving_avg=configs.moving_avg, + dropout=configs.dropout, + activation=configs.activation, + ) + for l in range(configs.d_layers) + ], + norm_layer=my_Layernorm(configs.d_model), + projection=nn.Linear(configs.d_model, configs.c_out, bias=True) + ) + if self.task_name == 'imputation': + self.projection = nn.Linear( + configs.d_model, configs.c_out, bias=True) + if self.task_name == 'anomaly_detection': + self.projection = nn.Linear( + configs.d_model, configs.c_out, bias=True) + if self.task_name == 'classification': + self.act = F.gelu + self.dropout = nn.Dropout(configs.dropout) + self.projection = nn.Linear( + configs.d_model * configs.seq_len, configs.num_class) + + def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec): + # decomp init + mean = torch.mean(x_enc, dim=1).unsqueeze( + 1).repeat(1, self.pred_len, 1) + zeros = torch.zeros([x_dec.shape[0], self.pred_len, + x_dec.shape[2]], device=x_enc.device) + seasonal_init, trend_init = self.decomp(x_enc) + # decoder input + trend_init = torch.cat( + [trend_init[:, -self.label_len:, :], mean], dim=1) + seasonal_init = torch.cat( + [seasonal_init[:, -self.label_len:, :], zeros], dim=1) + # enc + enc_out = self.enc_embedding(x_enc, x_mark_enc) + enc_out, attns = self.encoder(enc_out, attn_mask=None) + # dec + dec_out = self.dec_embedding(seasonal_init, x_mark_dec) + seasonal_part, trend_part = self.decoder(dec_out, enc_out, x_mask=None, cross_mask=None, + trend=trend_init) + # final + dec_out = trend_part + seasonal_part + return dec_out + + def imputation(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask): + # enc + enc_out = self.enc_embedding(x_enc, x_mark_enc) + enc_out, attns = self.encoder(enc_out, attn_mask=None) + # final + dec_out = self.projection(enc_out) + return dec_out + + def anomaly_detection(self, x_enc): + # enc + enc_out = self.enc_embedding(x_enc, None) + enc_out, attns = self.encoder(enc_out, attn_mask=None) + # final + dec_out = self.projection(enc_out) + return dec_out + + def classification(self, x_enc, x_mark_enc): + # enc + enc_out = self.enc_embedding(x_enc, None) + enc_out, attns = self.encoder(enc_out, attn_mask=None) + + # Output + # the output transformer encoder/decoder embeddings don't include non-linearity + output = self.act(enc_out) + output = self.dropout(output) + # zero-out padding embeddings + output = output * x_mark_enc.unsqueeze(-1) + # (batch_size, seq_length * d_model) + output = output.reshape(output.shape[0], -1) + output = self.projection(output) # (batch_size, num_classes) + return output + + def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None): + if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast': + dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) + return dec_out[:, -self.pred_len:, :] # [B, L, D] + if self.task_name == 'imputation': + dec_out = self.imputation( + x_enc, x_mark_enc, x_dec, x_mark_dec, mask) + return dec_out # [B, L, D] + if self.task_name == 'anomaly_detection': + dec_out = self.anomaly_detection(x_enc) + return dec_out # [B, L, D] + if self.task_name == 'classification': + dec_out = self.classification(x_enc, x_mark_enc) + return dec_out # [B, N] + return None diff --git a/models/DLinear.py b/models/DLinear.py new file mode 100644 index 0000000..db1559c --- /dev/null +++ b/models/DLinear.py @@ -0,0 +1,107 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from layers.Autoformer_EncDec import series_decomp + + +class Model(nn.Module): + """ + Paper link: https://arxiv.org/pdf/2205.13504.pdf + """ + + def __init__(self, configs, individual=False): + """ + individual: Bool, whether shared model among different variates. + """ + super(Model, self).__init__() + self.task_name = configs.task_name + self.seq_len = configs.seq_len + if self.task_name == 'classification' or self.task_name == 'anomaly_detection' or self.task_name == 'imputation': + self.pred_len = configs.seq_len + else: + self.pred_len = configs.pred_len + + self.decompsition = series_decomp(configs.moving_avg) + self.individual = individual + self.channels = configs.enc_in + + if self.individual: + self.Linear_Seasonal = nn.ModuleList() + self.Linear_Trend = nn.ModuleList() + + for i in range(self.channels): + self.Linear_Seasonal.append( + nn.Linear(self.seq_len, self.pred_len)) + self.Linear_Trend.append( + nn.Linear(self.seq_len, self.pred_len)) + + self.Linear_Seasonal[i].weight = nn.Parameter( + (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len])) + self.Linear_Trend[i].weight = nn.Parameter( + (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len])) + else: + self.Linear_Seasonal = nn.Linear(self.seq_len, self.pred_len) + self.Linear_Trend = nn.Linear(self.seq_len, self.pred_len) + + self.Linear_Seasonal.weight = nn.Parameter( + (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len])) + self.Linear_Trend.weight = nn.Parameter( + (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len])) + + if self.task_name == 'classification': + self.act = F.gelu + self.dropout = nn.Dropout(configs.dropout) + self.projection = nn.Linear( + configs.enc_in * configs.seq_len, configs.num_class) + + def encoder(self, x): + seasonal_init, trend_init = self.decompsition(x) + seasonal_init, trend_init = seasonal_init.permute( + 0, 2, 1), trend_init.permute(0, 2, 1) + if self.individual: + seasonal_output = torch.zeros([seasonal_init.size(0), seasonal_init.size(1), self.pred_len], + dtype=seasonal_init.dtype).to(seasonal_init.device) + trend_output = torch.zeros([trend_init.size(0), trend_init.size(1), self.pred_len], + dtype=trend_init.dtype).to(trend_init.device) + for i in range(self.channels): + seasonal_output[:, i, :] = self.Linear_Seasonal[i]( + seasonal_init[:, i, :]) + trend_output[:, i, :] = self.Linear_Trend[i]( + trend_init[:, i, :]) + else: + seasonal_output = self.Linear_Seasonal(seasonal_init) + trend_output = self.Linear_Trend(trend_init) + x = seasonal_output + trend_output + return x.permute(0, 2, 1) + + def forecast(self, x_enc): + return self.encoder(x_enc) + + def imputation(self, x_enc): + return self.encoder(x_enc) + + def anomaly_detection(self, x_enc): + return self.encoder(x_enc) + + def classification(self, x_enc): + enc_out = self.encoder(x_enc) + # Output + # (batch_size, seq_length * d_model) + output = enc_out.reshape(enc_out.shape[0], -1) + output = self.projection(output) # (batch_size, num_classes) + return output + + def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None): + if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast': + dec_out = self.forecast(x_enc) + return dec_out[:, -self.pred_len:, :] # [B, L, D] + if self.task_name == 'imputation': + dec_out = self.imputation(x_enc) + return dec_out # [B, L, D] + if self.task_name == 'anomaly_detection': + dec_out = self.anomaly_detection(x_enc) + return dec_out # [B, L, D] + if self.task_name == 'classification': + dec_out = self.classification(x_enc) + return dec_out # [B, N] + return None diff --git a/models/TimeLLM.py b/models/TimeLLM.py new file mode 100644 index 0000000..7c312bc --- /dev/null +++ b/models/TimeLLM.py @@ -0,0 +1,209 @@ +from math import sqrt + +import torch +import torch.nn as nn + +from transformers import LlamaConfig, LlamaModel, LlamaTokenizer +from layers.Embed import PatchEmbedding + +import transformers + +from layers.StandardNorm import Normalize + +transformers.logging.set_verbosity_error() + + +class FlattenHead(nn.Module): + def __init__(self, n_vars, nf, target_window, head_dropout=0): + super().__init__() + self.n_vars = n_vars + self.flatten = nn.Flatten(start_dim=-2) + self.linear = nn.Linear(nf, target_window) + self.dropout = nn.Dropout(head_dropout) + + def forward(self, x): + x = self.flatten(x) + x = self.linear(x) + x = self.dropout(x) + return x + + +class Model(nn.Module): + + def __init__(self, configs, patch_len=16, stride=8): + super(Model, self).__init__() + self.task_name = configs.task_name + self.pred_len = configs.pred_len + self.seq_len = configs.seq_len + self.d_ff = configs.d_ff + self.top_k = 5 + self.d_llm = 4096 + self.patch_len = configs.patch_len + self.stride = configs.stride + + self.llama_config = LlamaConfig.from_pretrained('/mnt/alps/modelhub/pretrained_model/LLaMA/7B_hf/') + # self.llama_config = LlamaConfig.from_pretrained('huggyllama/llama-7b') + self.llama_config.num_hidden_layers = configs.llm_layers + self.llama_config.output_attentions = True + self.llama_config.output_hidden_states = True + self.llama = LlamaModel.from_pretrained( + "/mnt/alps/modelhub/pretrained_model/LLaMA/7B_hf/", + # 'huggyllama/llama-7b', + trust_remote_code=True, + local_files_only=True, + config=self.llama_config, + load_in_4bit=True + ) + + self.tokenizer = LlamaTokenizer.from_pretrained( + "/mnt/alps/modelhub/pretrained_model/LLaMA/7B_hf/tokenizer.model", + # 'huggyllama/llama-7b', + trust_remote_code=True, + local_files_only=True + ) + + if self.tokenizer.eos_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + else: + pad_token = '[PAD]' + self.tokenizer.add_special_tokens({'pad_token': pad_token}) + self.tokenizer.pad_token = pad_token + + for param in self.llama.parameters(): + param.requires_grad = False + + self.dropout = nn.Dropout(configs.dropout) + + self.patch_embedding = PatchEmbedding( + configs.d_model, self.patch_len, self.stride, configs.dropout) + + self.word_embeddings = self.llama.get_input_embeddings().weight + self.vocab_size = self.word_embeddings.shape[0] + self.num_tokens = 1000 + self.mapping_layer = nn.Linear(self.vocab_size, self.num_tokens) + + self.reprogramming_layer = ReprogrammingLayer(configs.d_model, configs.n_heads, self.d_ff, self.d_llm) + + self.patch_nums = int((configs.seq_len - self.patch_len) / self.stride + 2) + self.head_nf = self.d_ff * self.patch_nums + + if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast': + self.output_projection = FlattenHead(configs.enc_in, self.head_nf, self.pred_len, + head_dropout=configs.dropout) + else: + raise NotImplementedError + + self.normalize_layers = Normalize(configs.enc_in, affine=False) + + def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None): + if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast': + dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) + return dec_out[:, -self.pred_len:, :] + return None + + def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec): + + x_enc = self.normalize_layers(x_enc, 'norm') + + B, T, N = x_enc.size() + x_enc = x_enc.permute(0, 2, 1).contiguous().reshape(B * N, T, 1) + + min_values = torch.min(x_enc, dim=1)[0] + max_values = torch.max(x_enc, dim=1)[0] + medians = torch.median(x_enc, dim=1).values + lags = self.calcute_lags(x_enc) + trends = x_enc.diff(dim=1).sum(dim=1) + + prompt = [] + for b in range(x_enc.shape[0]): + min_values_str = str(min_values[b].tolist()[0]) + max_values_str = str(max_values[b].tolist()[0]) + median_values_str = str(medians[b].tolist()[0]) + lags_values_str = str(lags[b].tolist()) + prompt_ = ( + f"<|start_prompt|>Dataset description: The Electricity Transformer Temperature (ETT) is a crucial indicator in the electric power long-term deployment." + f"Task description: forecast the next {str(self.pred_len)} steps given the previous {str(self.seq_len)} steps information; " + "Input statistics: " + f"min value {min_values_str}, " + f"max value {max_values_str}, " + f"median value {median_values_str}, " + f"the trend of input is {'upward' if trends[b] > 0 else 'downward'}, " + f"top 5 lags are : {lags_values_str}<|