cube-studio/install/docker/config.py

import imp
import json
import os
import sys
from dateutil import tz
from flask_appbuilder.security.manager import AUTH_REMOTE_USER, AUTH_DB
from myapp.stats_logger import DummyStatsLogger
# Realtime stats logger, a StatsD implementation exists
STATS_LOGGER = DummyStatsLogger()
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
if "MYAPP_HOME" in os.environ:
DATA_DIR = os.environ["MYAPP_HOME"]
else:
DATA_DIR = os.path.join(os.path.expanduser("~"), ".myapp")
APP_THEME = "readable.css"
FAB_UPDATE_PERMS=True
FAB_STATIC_FOLDER = BASE_DIR + "/static/appbuilder/"
# ---------------------------------------------------------
# Myapp specific config
# ---------------------------------------------------------
# PACKAGE_DIR = os.path.join(BASE_DIR, "static", "assets")
# PACKAGE_FILE = os.path.join(PACKAGE_DIR, "package.json")
# with open(PACKAGE_FILE) as package_file:
# VERSION_STRING = json.load(package_file)["version"]
MYAPP_WORKERS = 2 # deprecated
MYAPP_CELERY_WORKERS = 32 # deprecated
MYAPP_WEBSERVER_ADDRESS = "0.0.0.0"
MYAPP_WEBSERVER_PORT = 80
# This is an important setting, and should be lower than your
# [load balancer / proxy / envoy / kong / ...] timeout settings.
# You should also make sure to configure your WSGI server
# (gunicorn, nginx, apache, ...) timeout setting to be <= to this setting
MYAPP_WEBSERVER_TIMEOUT = 300
# Your App secret key
# Must be set for sessions to work properly
SECRET_KEY = "\2\1thisismyscretkey\1\2\e\y\y\h" # noqa
# The SQLAlchemy connection string.
# SQLALCHEMY_DATABASE_URI = "sqlite:///" + os.path.join(DATA_DIR, "myapp.db")
# SQLALCHEMY_DATABASE_URI = 'mysql://myapp@localhost/myapp'
# SQLALCHEMY_DATABASE_URI = 'postgresql://root:password@localhost/myapp'
# SQLALCHEMY_DATABASE_URI = os.getenv('MYSQL_SERVICE','mysql+pymysql://root:admin@127.0.0.1:3306/myapp?charset=utf8') # default must set None
CSV_EXPORT = {"encoding": "utf_8_sig"}
# Flask-WTF flag for CSRF
WTF_CSRF_ENABLED = False
# Add endpoints that need to be exempt from CSRF protection
WTF_CSRF_EXEMPT_LIST = ["myapp.views.core.log"]
# Whether to run the web server in debug mode or not
DEBUG = os.environ.get("FLASK_ENV") == "development"
FLASK_USE_RELOAD = True
# Myapp allows server-side python stacktraces to be surfaced to the
# user when this feature is on. This may have security implications
# and it's more secure to turn it off in production settings.
SHOW_STACKTRACE = True
# Extract and use X-Forwarded-For/X-Forwarded-Proto headers?
ENABLE_PROXY_FIX = False
# ------------------------------
# GLOBALS FOR APP Builder
# ------------------------------
# Uncomment to setup Your App name
APP_NAME = "Kubeflow"
# Uncomment to setup an App icon
APP_ICON = "/static/assets/images/myapp-logo.png"
APP_ICON_WIDTH = 126
# Uncomment to specify where clicking the logo would take the user
# e.g. setting it to '/welcome' would take the user to '/myapp/welcome'
LOGO_TARGET_PATH = None
# ----------------------------------------------------
# AUTHENTICATION CONFIG
# ----------------------------------------------------
# The authentication type
# AUTH_OID : Is for OpenID
# AUTH_DB : Is for database (username/password)
# AUTH_LDAP : Is for LDAP
# AUTH_REMOTE_USER : Is for using REMOTE_USER from web server
AUTH_TYPE = AUTH_DB
# AUTH_TYPE = AUTH_REMOTE_USER
# Uncomment to setup Full admin role name
# AUTH_ROLE_ADMIN = 'Admin'
# Uncomment to setup Public role name, no authentication needed
# AUTH_ROLE_PUBLIC = 'Public'
# Will allow user self registration
AUTH_USER_REGISTRATION = False
# The default user self registration role
AUTH_USER_REGISTRATION_ROLE = "Gamma"
# RECAPTCHA_PUBLIC_KEY = 'GOOGLE PUBLIC KEY FOR RECAPTCHA'
# RECAPTCHA_PRIVATE_KEY = 'GOOGLE PRIVATE KEY FOR RECAPTCHA'
OAUTH_PROVIDERS=[]
# When using LDAP Auth, setup the ldap server
# AUTH_LDAP_SERVER = "ldap://ldapserver.new"
# Uncomment to setup OpenID providers example for OpenID authentication
# OPENID_PROVIDERS = [
#     { 'name': 'Yahoo', 'url': 'https://open.login.yahoo.com/' },
#     { 'name': 'Flickr', 'url': 'https://www.flickr.com/<username>' },
# ]
# ---------------------------------------------------
# Babel config for translations
# ---------------------------------------------------
# Setup default language
BABEL_DEFAULT_LOCALE = "en"
# Your application default translation path
BABEL_DEFAULT_FOLDER = "myapp/translations"
# The allowed translations for your app
LANGUAGES = {
    "en": {"flag": "us", "name": "English"},
    "zh": {"flag": "cn", "name": "Chinese"},
}
# ---------------------------------------------------
# Feature flags
# ---------------------------------------------------
# Feature flags that are set by default go here. Their values can be
# overwritten by those specified under FEATURE_FLAGS in myapp_config.py.
# For example, DEFAULT_FEATURE_FLAGS = { 'FOO': True, 'BAR': False } here
# and FEATURE_FLAGS = { 'BAR': True, 'BAZ': True } in myapp_config.py
# will result in combined feature flags of { 'FOO': True, 'BAR': True, 'BAZ': True }
DEFAULT_FEATURE_FLAGS = {
    # Experimental feature introducing a client (browser) cache
    "CLIENT_CACHE": False,
    "ENABLE_EXPLORE_JSON_CSRF_PROTECTION": False,
}
# A function that receives a dict of all feature flags
# (DEFAULT_FEATURE_FLAGS merged with FEATURE_FLAGS)
# can alter it, and returns a similar dict. Note the dict of feature
# flags passed to the function is a deepcopy of the dict in the config,
# and can therefore be mutated without side-effect
#
# GET_FEATURE_FLAGS_FUNC can be used to implement progressive rollouts,
# role-based features, or a full on A/B testing framework.
#
# from flask import g, request
# def GET_FEATURE_FLAGS_FUNC(feature_flags_dict):
#     feature_flags_dict['some_feature'] = g.user and g.user.id == 5
#     return feature_flags_dict
GET_FEATURE_FLAGS_FUNC = None
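# A minimal illustration (not executed here) of the merge semantics described above:
#
#   DEFAULT_FEATURE_FLAGS = {'FOO': True, 'BAR': False}
#   FEATURE_FLAGS = {'BAR': True, 'BAZ': True}
#   merged = {**DEFAULT_FEATURE_FLAGS, **FEATURE_FLAGS}
#   assert merged == {'FOO': True, 'BAR': True, 'BAZ': True}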
# ---------------------------------------------------
# Image and file configuration
# ---------------------------------------------------
# The file upload folder, when using models with files
UPLOAD_FOLDER = BASE_DIR + "/static/file/uploads/"
DOWNLOAD_FOLDER = BASE_DIR + "/static/file/download/"
DOWNLOAD_URL = "/static/file/download/"
# The image upload folder, when using models with images
IMG_UPLOAD_FOLDER = BASE_DIR + "/static/file/uploads/"
# The image upload url, when using models with images
IMG_UPLOAD_URL = "/static/file/uploads/"
# Setup image size default is (300, 200, True)
# IMG_SIZE = (300, 200, True)
CACHE_DEFAULT_TIMEOUT = 60 * 60 * 24  # default cache timeout: 24 hours
CACHE_CONFIG = {"CACHE_TYPE": "null"}  # caching disabled
# CORS Options
ENABLE_CORS = True
CORS_OPTIONS = {"supports_credentials":True}
# Chrome allows up to 6 open connections per domain at a time. When there are more
# than 6 slices in a dashboard, many fetch requests are queued up waiting for the
# next available socket. PR #5039 is trying to allow domain sharding for Myapp,
# and this feature will be enabled by configuration only (by default Myapp
# doesn't allow cross-domain requests).
MYAPP_WEBSERVER_DOMAINS = None
# ---------------------------------------------------
# Time grain configurations
# ---------------------------------------------------
# List of time grains to disable in the application (see list of builtin
# time grains in myapp/db_engine_specs.builtin_time_grains).
# For example: to disable 1 second time grain:
# TIME_GRAIN_BLACKLIST = ['PT1S']
TIME_GRAIN_BLACKLIST = []
# Additional time grains to be supported using similar definitions as in
# myapp/db_engine_specs.builtin_time_grains.
# For example: To add a new 2 second time grain:
# TIME_GRAIN_ADDONS = {'PT2S': '2 second'}
TIME_GRAIN_ADDONS = {}
# Implementation of additional time grains per engine.
# For example: To implement 2 second time grain on clickhouse engine:
# TIME_GRAIN_ADDON_FUNCTIONS = {
# 'clickhouse': {
# 'PT2S': 'toDateTime(intDiv(toUInt32(toDateTime({col})), 2)*2)'
# }
# }
TIME_GRAIN_ADDON_FUNCTIONS = {}
# Console Log Settings
ADDITIONAL_MODULE_DS_MAP = {}
ADDITIONAL_MIDDLEWARE = []
LOG_FORMAT = "%(asctime)s:%(levelname)s:%(name)s:%(message)s"
LOG_LEVEL = "DEBUG"
# ---------------------------------------------------
# Enable Time Rotate Log Handler
# ---------------------------------------------------
# LOG_LEVEL = DEBUG, INFO, WARNING, ERROR, CRITICAL
ENABLE_TIME_ROTATE = False
TIME_ROTATE_LOG_LEVEL = "DEBUG"
FILENAME = os.path.join(DATA_DIR, "myapp.log")
ROLLOVER = "midnight"
INTERVAL = 1
BACKUP_COUNT = 30
# Custom logger for auditing queries. This can be used to send ran queries to a
# structured immutable store for auditing purposes. The function is called for
# every query ran, in both SQL Lab and charts/dashboards.
# def QUERY_LOGGER(
#     database,
#     query,
#     schema=None,
#     user=None,
#     client=None,
#     security_manager=None,
# ):
#     pass
# Set this API key to enable Mapbox visualizations
MAPBOX_API_KEY = os.environ.get("MAPBOX_API_KEY", "")
# If defined, shows this text in an alert-warning box in the navbar
# one example use case may be "STAGING" to make it clear that this is
# not the production version of the site.
WARNING_MSG = None
from celery.schedules import crontab
from werkzeug.contrib.cache import RedisCache
# Additional static HTTP headers to be served by your Myapp server. Note
# Flask-Talisman applies the relevant security HTTP headers.
HTTP_HEADERS = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "*",
    "Access-Control-Allow-Headers": "*",
}
# A dictionary of items that gets merged into the Jinja context for
# SQL Lab. The existing context gets updated with this dictionary,
# meaning values for existing keys get overwritten by the content of this
# dictionary.
JINJA_CONTEXT_ADDONS = {}
# Roles that are controlled by the API / Myapp and should not be changed
# by humans.
ROBOT_PERMISSION_ROLES = ["Gamma", "Admin"]
CONFIG_PATH_ENV_VAR = "MYAPP_CONFIG_PATH"
# If a callable is specified, it will be called at app startup while passing
# a reference to the Flask app. This can be used to alter the Flask app
# in whatever way.
# example: FLASK_APP_MUTATOR = lambda app: app.before_request(f)
FLASK_APP_MUTATOR = None
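# A hypothetical mutator, for illustration only (the hook name is made up):
#
#   def FLASK_APP_MUTATOR(app):
#       @app.before_request
#       def ensure_something():
#           pass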
# Set this to false if you don't want users to be able to request/grant
# access requests from/to other users.
ENABLE_ACCESS_REQUEST = True
# smtp server configuration
EMAIL_NOTIFICATIONS = False  # all the emails are sent using dry-run
SMTP_HOST = "localhost"
SMTP_STARTTLS = True
SMTP_SSL = False
SMTP_USER = ""
SMTP_PORT = 25
SMTP_PASSWORD = ""
SMTP_MAIL_FROM = ""
if not CACHE_DEFAULT_TIMEOUT:
    CACHE_DEFAULT_TIMEOUT = CACHE_CONFIG.get("CACHE_DEFAULT_TIMEOUT")
# Whether to bump the logging level to ERROR on the flask_appbuilder package
# Set to False if/when debugging FAB related issues like
# permission management
SILENCE_FAB = True
# The link to a page containing common errors and their resolutions
# It will be appended at the bottom of sql_lab errors.
TROUBLESHOOTING_LINK = ""
# CSRF token timeout, set to None for a token that never expires
WTF_CSRF_TIME_LIMIT = 60 * 60 * 24 * 7
# This link should lead to a page with instructions on how to gain access to a
# resource; it is shown to users who lack the required permission.
PERMISSION_INSTRUCTIONS_LINK = ""
# Integrate external Blueprints to the app by passing them to your
# configuration. These blueprints will get integrated in the app
BLUEPRINTS = []
# Provide a callable that receives a tracking_url and returns another
# URL. This is used to translate internal Hadoop job tracker URL
# into a proxied one
TRACKING_URL_TRANSFORMER = lambda x: x # noqa: E731
# Allow for javascript controls components
# this enables programmers to customize certain charts (like the
# geospatial ones) by inputting javascript in controls. This exposes
# an XSS security vulnerability
ENABLE_JAVASCRIPT_CONTROLS = False
# A callable that allows altering the database connection URL and params
# on the fly, at runtime. This allows for things like impersonation or
# arbitrary logic. For instance you can wire different users to
# use different connection parameters, or pass their email address as the
# username. The function receives the connection uri object, connection
# params, the username, and returns the mutated uri and params objects.
# Example:
# def DB_CONNECTION_MUTATOR(uri, params, username, security_manager, source):
#     user = security_manager.find_user(username=username)
#     if user and user.email:
#         uri.username = user.email
#     return uri, params
#
# Note that the returned uri and params are passed directly to sqlalchemy's
# create_engine, as in `create_engine(url, **params)`
DB_CONNECTION_MUTATOR = None
# When not using gunicorn (nginx for instance), you may want to disable
# the use of flask-compress
ENABLE_FLASK_COMPRESS = True
# Enable / disable scheduled email reports
ENABLE_SCHEDULED_EMAIL_REPORTS = True
# If enabled, certain features are run in debug mode
# Current list:
# * Emails are sent using dry-run mode (logging only)
SCHEDULED_EMAIL_DEBUG_MODE = True
# Minimum scheduling interval for pipeline tasks, in minutes
PIPELINE_TASK_CRON_RESOLUTION = 10
# Email report configuration
# From address in emails
EMAIL_REPORT_FROM_ADDRESS = "reports@myapp.org"
# Send bcc of all reports to this address. Set to None to disable.
# This is useful for maintaining an audit trail of all email deliveries.
EMAIL_REPORT_BCC_ADDRESS = None
# User credentials to use for generating reports
# This user should have permissions to browse all the dashboards and
# slices.
# TODO: In the future, login as the owner of the item to generate reports
EMAIL_REPORTS_USER = "admin"
EMAIL_REPORTS_SUBJECT_PREFIX = "[Report] "
# The webdriver to use for generating reports. Use one of the following:
# firefox:
#   Requires: geckodriver and firefox installations
#   Limitations: can be buggy at times
# chrome:
#   Requires: headless chrome
#   Limitations: unable to generate screenshots of elements
EMAIL_REPORTS_WEBDRIVER = "chrome"
# Any config options to be passed as-is to the webdriver
WEBDRIVER_CONFIGURATION = {}
# Send user to a link where they can report bugs
BUG_REPORT_URL = None
# Send user to a link where they can read more about Myapp
DOCUMENTATION_URL = None
# Do you want Talisman enabled?
TALISMAN_ENABLED = False
# If you want Talisman, how do you want it configured?
TALISMAN_CONFIG = {
    "content_security_policy": None,
    "force_https": True,
    "force_https_permanent": False,
}
try:
    if CONFIG_PATH_ENV_VAR in os.environ:
        # Explicitly import config module that is not in pythonpath; useful
        # for case where app is being executed via pex.
        print(
            "Loaded your LOCAL configuration at [{}]".format(
                os.environ[CONFIG_PATH_ENV_VAR]
            )
        )
        module = sys.modules[__name__]
        override_conf = imp.load_source(
            "myapp_config", os.environ[CONFIG_PATH_ENV_VAR]
        )
        for key in dir(override_conf):
            if key.isupper():
                setattr(module, key, getattr(override_conf, key))
    else:
        from myapp_config import *  # noqa
        import myapp_config

        print(
            "Loaded your LOCAL configuration at [{}]".format(myapp_config.__file__)
        )
except ImportError:
    pass
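# Example of the override mechanism above (hypothetical file path): any UPPERCASE
# name defined in the file pointed to by MYAPP_CONFIG_PATH replaces the value here.
#
#   # /etc/myapp/myapp_config.py
#   SQLALCHEMY_DATABASE_URI = 'mysql+pymysql://root:admin@mysql.infra:3306/myapp?charset=utf8'
#   LOG_LEVEL = 'INFO'
#
#   $ export MYAPP_CONFIG_PATH=/etc/myapp/myapp_config.py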
def get_env_variable(var_name, default=None):
    """Get the environment variable or raise exception."""
    try:
        return os.environ[var_name]
    except KeyError:
        if default is not None:
            return default
        else:
            error_msg = 'The environment variable {} was missing, abort...'.format(var_name)
            raise EnvironmentError(error_msg)
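# Usage: a missing variable without a default raises EnvironmentError when called,
# while a provided default makes the variable optional, e.g.:
#   redis_host = get_env_variable('REDIS_HOST', '127.0.0.1')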
# Database connection pool configuration
SQLALCHEMY_POOL_SIZE = 100
SQLALCHEMY_POOL_RECYCLE = 300  # recycle connections after this many seconds; must be smaller than the database's own connection timeout
SQLALCHEMY_MAX_OVERFLOW = 300
SQLALCHEMY_TRACK_MODIFICATIONS = False
# Redis configuration
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', 'admin')  # default for local use; override in production
REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
REDIS_PORT = os.getenv('REDIS_PORT', '6379')
# The base URL to query for accessing the user interface
WEBDRIVER_BASEURL = os.getenv('WEBDRIVER_BASEURL', 'http://127.0.0.1/')
SQLALCHEMY_DATABASE_URI = os.getenv('MYSQL_SERVICE', 'mysql+pymysql://root:admin@127.0.0.1:3306/myapp?charset=utf8')  # default for local use; override in production
# crontab and RedisCache are already imported above
RESULTS_BACKEND = RedisCache(
    host=REDIS_HOST, port=int(REDIS_PORT), key_prefix='myapp_results', password=REDIS_PASSWORD)
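# Werkzeug cache API, for reference: results are stored/fetched with
#   RESULTS_BACKEND.set('some_key', b'payload', timeout=3600)
#   RESULTS_BACKEND.get('some_key')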
class CeleryConfig(object):
    # Task queue (broker)
    BROKER_URL = 'redis://:%s@%s:%s/0' % (REDIS_PASSWORD, REDIS_HOST, str(REDIS_PORT)) if REDIS_PASSWORD else 'redis://%s:%s/0' % (REDIS_HOST, str(REDIS_PORT))
    # Modules where celery tasks are defined
    CELERY_IMPORTS = (
        'myapp.tasks',
    )
    # Result backend
    CELERY_RESULT_BACKEND = 'redis://:%s@%s:%s/0' % (REDIS_PASSWORD, REDIS_HOST, str(REDIS_PORT)) if REDIS_PASSWORD else 'redis://%s:%s/0' % (REDIS_HOST, str(REDIS_PORT))
    CELERYD_LOG_LEVEL = 'DEBUG'
    # Number of tasks each celery worker prefetches from redis at a time
    CELERYD_PREFETCH_MULTIPLIER = 10
    # Number of tasks a worker executes before it is replaced; a large value is recommended
    # CELERYD_MAX_TASKS_PER_CHILD = 200
    # Expiry time for celery task results
    # CELERY_TASK_RESULT_EXPIRES = 1200
    # Runtime limit for a single task; tasks exceeding it are killed
    # CELERYD_TASK_TIME_LIMIT = 60
    # Whether tasks are acknowledged only after completion; has a slight performance cost
    CELERY_ACKS_LATE = True
    CELERY_SEND_TASK_SENT_EVENT = True
    # Worker concurrency; defaults to the number of CPU cores, same as the -c command line flag
    # CELERYD_CONCURRENCY = 4
    CELERY_TIMEZONE = 'Asia/Shanghai'
    CELERY_ENABLE_UTC = False
    # Per-task limits: keys are celery task names, values are the limit configuration
    CELERY_ANNOTATIONS = {
        'task.delete_tfjob': {
            'rate_limit': '1/h',
            # 'time_limit': 1200,  # hard time limit, cannot be caught
            'soft_time_limit': 1200,  # soft runtime limit, can be caught inside the task
            'ignore_result': True,
        },
        'task.run_workflow': {
            'rate_limit': '1/s',
            # 'time_limit': 1,
            'soft_time_limit': 600,  # only supported by the prefork pool
            'ignore_result': True,
        },
        'task.check_docker_commit': {
            'rate_limit': '1/s',
            'ignore_result': True,
        },
        'task.upload_workflow': {
            'rate_limit': '10/s',
            'ignore_result': True,
        }
    }
    # Scheduled (beat) tasks: keys are celery task names, values are the schedule configuration
    CELERYBEAT_SCHEDULE = {
        'task_task1': {
            'task': 'task.delete_tfjob',
            # 'schedule': 10.0,
            'schedule': crontab(minute='1'),
        },
        'task_task2': {
            'task': 'task.make_timerun_config',
            # 'schedule': 10.0,  # run every 10 seconds
            'schedule': crontab(minute='*/5'),
        },
        # 'task_task3': {
        #     'task': 'task.upload_timerun',
        #     # 'schedule': 10.0,  # run every 10 seconds
        #     'schedule': crontab(minute='*/5'),
        # },
        'task_task4': {
            'task': 'task.delete_old_data',
            # 'schedule': 100.0,  # run every 100 seconds
            'schedule': crontab(minute='1', hour='1'),
        },
        'task_task5': {
            'task': 'task.delete_notebook',
            # 'schedule': 10.0,
            'schedule': crontab(minute='1', hour='4'),
        },
        # 'task_task6': {
        #     'task': 'task.push_workspace_size',
        #     # 'schedule': 10.0,
        #     'schedule': crontab(minute='10', hour='10'),
        # },
        'task_task6': {
            'task': "task.check_pipeline_run",
            'schedule': crontab(minute='10', hour='11'),
        },
        'task_task7': {
            'task': 'task.list_train_model',
            # 'schedule': 10.0,
            'schedule': crontab(minute='20', hour='10'),
        },
        'task_task8': {
            'task': 'task.delete_debug_docker',
            # 'schedule': 10.0,
            'schedule': crontab(minute='30', hour='22'),
        },
        'task_task9': {
            'task': 'task.watch_gpu',
            # 'schedule': 10.0,
            'schedule': crontab(minute='10', hour='8-23/2'),
        },
        'task_task10': {
            'task': 'task.adjust_node_resource',
            # 'schedule': 10.0,
            'schedule': crontab(minute='*/10'),
        }
    }
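# For reference, the 'task.*' keys above are assumed to match celery task names
# registered explicitly in myapp.tasks; a hypothetical sketch of such a task:
#
#   from celery import Celery
#   celery_app = Celery(__name__)
#   celery_app.config_from_object(CeleryConfig)
#
#   @celery_app.task(name='task.delete_tfjob')
#   def delete_tfjob():
#       ...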
DOCUMENTATION_URL = 'https://github.com/tencentmusic/cube-studio/tree/master/docs/example'  # help documentation URL shown in the web navbar
ROBOT_PERMISSION_ROLES = []  # role blacklist
FAB_API_MAX_PAGE_SIZE = 100  # maximum page size; defaults to 20 if unset
CACHE_DEFAULT_TIMEOUT = 10 * 60  # default cache expiry: 10 minutes
# CACHE_CONFIG = {
#     'CACHE_TYPE': 'redis',  # use Redis
#     'CACHE_REDIS_HOST': REDIS_HOST,  # hostname
#     'CACHE_REDIS_PORT': int(REDIS_PORT),  # port
#     'CACHE_REDIS_URL': 'redis://:%s@%s:%s/0' % (REDIS_PASSWORD, REDIS_HOST, str(REDIS_PORT)) if REDIS_PASSWORD else 'redis://%s:%s/0' % (REDIS_HOST, str(REDIS_PORT))  # 0 is the database number; redis has databases 0-16
# }
CELERY_CONFIG = CeleryConfig
REMEMBER_COOKIE_NAME = "remember_token"  # cookie name for cookie-based user authentication
AUTH_HEADER_NAME = 'Authorization'  # header name for header-based authentication
# The custom resources (CRDs) used in k8s.
# timeout is the expiry window for resources that are deleted periodically: instances
# created more than timeout seconds ago are considered stale, so expired tasks can be cleaned up promptly.
CRD_INFO = {
    "workflow": {
        "group": "argoproj.io",
        "version": "v1alpha1",
        "plural": "workflows",
        'kind': 'Workflow',
        "timeout": 60 * 60 * 24 * 2
    },
    "mpijob": {
        "group": "kubeflow.org",
        "version": "v1",
        "plural": "mpijobs",
        'kind': 'MPIJob',
        "timeout": 60 * 60 * 24 * 2
    },
    "tfjob": {
        "group": "kubeflow.org",
        "version": "v1",
        "plural": "tfjobs",
        'kind': 'TFJob',
        "timeout": 60 * 60 * 24 * 2
    },
    "xgbjob": {
        "group": "xgboostjob.kubeflow.org",
        "version": "v1alpha1",
        "plural": "xgboostjobs",
        "timeout": 60 * 60 * 24 * 2
    },
    "experiment": {
        "group": "kubeflow.org",
        "version": 'v1alpha3',
        "plural": "experiments",
        'kind': 'Experiment',
        "timeout": 60 * 60 * 24 * 2
    },
    "pytorchjob": {
        "group": "kubeflow.org",
        "version": "v1",
        'kind': 'PyTorchJob',
        "plural": "pytorchjobs",
        "timeout": 60 * 60 * 24 * 2
    },
    "notebook": {
        "group": "kubeflow.org",
        "version": "v1",
        "plural": "notebooks",
        'kind': 'Notebook',
        "timeout": 60 * 60 * 24 * 2
    },
    "inferenceservice": {
        "group": "serving.kserve.io",
        "version": "v1beta1",
        "plural": "inferenceservices",
        'kind': 'InferenceService',
        "timeout": 60 * 60 * 24 * 1
    },
    "virtualservice": {
        "group": "networking.istio.io",
        "version": "v1alpha3",
        "plural": "virtualservices",
        'kind': 'VirtualService',
        "timeout": 60 * 60 * 24 * 1
    },
    "framework": {
        "group": "frameworkcontroller.microsoft.com",
        "version": "v1",
        "plural": "frameworks",
        'kind': 'Framework',
        "timeout": 60 * 60 * 24 * 2
    },
    "vcjob": {
        "group": "batch.volcano.sh",
        "version": "v1alpha1",
        'kind': 'Job',
        "plural": "jobs",
        "timeout": 60 * 60 * 24 * 2
    }
}
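# A minimal sketch (illustrative, not used by this file) of how a CRD_INFO entry
# maps onto the official kubernetes python client:
#
#   from kubernetes import client, config
#   config.load_incluster_config()  # or config.load_kube_config()
#   crd = CRD_INFO['workflow']
#   objs = client.CustomObjectsApi().list_namespaced_custom_object(
#       group=crd['group'], version=crd['version'],
#       namespace='pipeline', plural=crd['plural'])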
# Environment variables carried by every task; the {{...}} template variables are rendered before insertion
GLOBAL_ENV = {
    "KFJ_PIPELINE_ID": "{{pipeline_id}}",
    "KFJ_RUN_ID": "{{uuid.uuid4().hex}}",
    "KFJ_CREATOR": "{{creator}}",
    "KFJ_RUNNER": "{{runner}}",
    "KFJ_MODEL_REPO_API_URL": "http://kubeflow-dashboard.infra",
    "KFJ_ARCHIVE_BASE_PATH": "/archives",
    "KFJ_PIPELINE_NAME": "{{pipeline_name}}",
    "KFJ_NAMESPACE": "pipeline",
    "KFJ_GPU_TYPE": os.environ.get("GPU_TYPE", "NVIDIA"),
    "GPU_TYPE": os.environ.get("GPU_TYPE", "NVIDIA"),
    "KFJ_GPU_MEM_MIN": "13G",
    "KFJ_GPU_MEM_MAX": "13G",
    "KFJ_ENVIRONMENT": "{{cluster_name}}",
}
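# A hedged sketch (hypothetical context values) of the rendering assumed above:
# each {{...}} placeholder is a jinja-style expression filled in per task run.
#
#   import uuid
#   from jinja2 import Template
#   context = {'pipeline_id': 42, 'uuid': uuid, 'creator': 'admin', 'runner': 'admin',
#              'pipeline_name': 'demo', 'cluster_name': 'dev'}
#   env = {k: Template(str(v)).render(**context) for k, v in GLOBAL_ENV.items()}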
GPU_TYPE = os.environ.get("GPU_TYPE", "NVIDIA")
TASK_GPU_TYPE='NVIDIA'
SERVICE_GPU_TYPE='NVIDIA'
NOTEBOOK_GPU_TYPE='NVIDIA'
# Help documentation links for each model-list page
HELP_URL = {
    "pipeline": "https://github.com/tencentmusic/cube-studio/tree/master/docs/example",
    "job_template": "https://github.com/tencentmusic/cube-studio/tree/master/job-template",
    "task": "https://github.com/tencentmusic/cube-studio/tree/master/docs/example",
    "nni": "https://github.com/tencentmusic/cube-studio/tree/master/docs/example",
    "images": "https://github.com/tencentmusic/cube-studio/tree/master/images",
    "notebook": "https://github.com/tencentmusic/cube-studio/tree/master/docs/example",
    "service": "https://github.com/tencentmusic/cube-studio/tree/master/docs/example",
    "inferenceservice": "https://github.com/tencentmusic/cube-studio/tree/master/docs/example",
    "run": "https://github.com/tencentmusic/cube-studio/tree/master/docs/example",
    "docker": "https://github.com/tencentmusic/cube-studio/tree/master/images"
}
# Name of the job template that uses the user's own image directly instead of the image defined in the template
CUSTOMIZE_JOB = '自定义镜像'
PIPELINE_TASK_BCC_ADDRESS = 'admin'
ADMIN_USER = 'admin'
PIPELINE_NAMESPACE = 'pipeline'
SERVICE_PIPELINE_NAMESPACE = 'service'
KATIB_NAMESPACE = 'katib'
NOTEBOOK_NAMESPACE = 'jupyter'
SERVICE_NAMESPACE = 'service'
PRE_SERVICE_NAMESPACE = 'pre-service'
KFSERVING_NAMESPACE = 'kfserving'
SERVICE_PIPELINE_ZIPKIN = 'http://xx.xx.xx.xx:9401'
SERVICE_PIPELINE_JAEGER = 'tracing.service'
KATIB_JOB_DEFAULT_IMAGE = 'ccr.ccs.tencentyun.com/cube-studio/katib'
KATIB_TFJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0'
KATIB_PYTORCHJOB_DEFAULT_IMAGE = 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0'
# Default k8s hubsecret names attached when pulling images from a private registry
HUBSECRET = ['hubsecret']
# Organization name in the private registry; images built online by users are pushed under this organization automatically
REPOSITORY_ORG = 'ccr.ccs.tencentyun.com/cube-studio/'
# User account used by each notebook pod
JUPYTER_ACCOUNTS = 'jupyter-user'
HUBSECRET_NAMESPACE = [PIPELINE_NAMESPACE, KATIB_NAMESPACE, NOTEBOOK_NAMESPACE, SERVICE_NAMESPACE, KFSERVING_NAMESPACE]
# Images available for notebooks
NOTEBOOK_IMAGES = [
    ['ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-cpu-base', 'vscodecpu'],
    ['ccr.ccs.tencentyun.com/cube-studio/notebook:vscode-ubuntu-gpu-base', 'vscodegpu'],
    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-base', 'jupytercpu'],
    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-gpu-base', 'jupytergpu'],
    ['ccr.ccs.tencentyun.com/cube-studio/notebook:jupyter-ubuntu-cpu-1.0.0', 'jupytertensorboard'],
]
# Directories whose size is checked periodically. The check task must be started in celery.
CHECK_WORKSPACE_SIZE = [
    "/data/k8s/kubeflow/pipeline/workspace",
    "/data/k8s/kubeflow/pipeline/archives",
]
# Directories periodically checked and cleaned of old data
DELETE_OLD_DATA = [
    "/data/k8s/kubeflow/minio/mlpipeline",
    "/data/k8s/kubeflow/minio/mlpipeline/pipelines",
    "/data/k8s/kubeflow/minio/mlpipeline/artifacts"
]
# User workspace directory; a personal directory is created automatically for each user under it
WORKSPACE_HOST_PATH = '/data/k8s/kubeflow/pipeline/workspace'
# Per-user archive directory; can be used to store trained models
ARCHIVES_HOST_PATH = "/data/k8s/kubeflow/pipeline/archives"
# prometheus address
PROMETHEUS = 'prometheus-k8s.monitoring:9090'
KFP_HOST = "http://ml-pipeline.kubeflow:8888"
NNI_IMAGES = 'ccr.ccs.tencentyun.com/cube-studio/nni:20211003'
ISTIO_INGRESS_DOMAIN = os.getenv('ISTIO_INGRESS_DOMAIN', 'local.com')  # wildcard domain suffix; may differ from HOST. Features that depend on a wildcard domain are unavailable without one
K8S_DASHBOARD_CLUSTER = '/k8s/dashboard/cluster/'
K8S_DASHBOARD_PIPELINE = '/k8s/dashboard/pipeline/'
PIPELINE_URL = '/pipeline/#/'
KATIB_URL = '/katib/#'
# These two settings require a wildcard domain; without one the corresponding features are limited
KFSERVING_DOMAIN = 'kfserving.%s' % ISTIO_INGRESS_DOMAIN
SERVICE_DOMAIN = 'service.%s' % ISTIO_INGRESS_DOMAIN
# Internal host entries, one per line
HOSTALIASES = '''
127.0.0.1 localhost
'''
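# Presumably consumed line by line into pod host entries; an illustrative parse:
#   aliases = [line.split() for line in HOSTALIASES.strip().splitlines() if line.strip()]
#   # -> [['127.0.0.1', 'localhost']]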
SERVICE_EXTERNAL_IP = []
ALL_LINKS = [
    {
        "label": "Minio",
        "name": "minio",
        "url": "/minio/public/"
    },
    {
        "label": "K8s Dashboard",
        "name": "kubernetes_dashboard",
        "url": "/k8s/dashboard/cluster/#/pod?namespace=infra"
    },
    {
        "label": "Grafana",
        "name": "grafana",
        "url": '/grafana/'  # URL for accessing grafana
    }
]
TFSERVING_IMAGES = [
    'ccr.ccs.tencentyun.com/cube-studio/serving:1.11.0', 'ccr.ccs.tencentyun.com/cube-studio/serving:1.11.0-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:1.12.0', 'ccr.ccs.tencentyun.com/cube-studio/serving:1.12.0-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:1.13.0', 'ccr.ccs.tencentyun.com/cube-studio/serving:1.13.0-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:1.14.0', 'ccr.ccs.tencentyun.com/cube-studio/serving:1.14.0-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:2.0.0', 'ccr.ccs.tencentyun.com/cube-studio/serving:2.0.0-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:2.1.4', 'ccr.ccs.tencentyun.com/cube-studio/serving:2.1.4-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:2.2.3', 'ccr.ccs.tencentyun.com/cube-studio/serving:2.2.3-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:2.3.4', 'ccr.ccs.tencentyun.com/cube-studio/serving:2.3.4-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:2.4.3', 'ccr.ccs.tencentyun.com/cube-studio/serving:2.4.3-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:2.5.2', 'ccr.ccs.tencentyun.com/cube-studio/serving:2.5.2-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/serving:2.6.0', 'ccr.ccs.tencentyun.com/cube-studio/serving:2.6.0-gpu',
]
TRITONSERVER_IMAGES = ['ccr.ccs.tencentyun.com/cube-studio/tritonserver:21.12-py3', 'ccr.ccs.tencentyun.com/cube-studio/tritonserver:21.09-py3']
TORCHSERVER_IMAGES = [
    'ccr.ccs.tencentyun.com/cube-studio/torchserve:0.5.0-cpu', 'ccr.ccs.tencentyun.com/cube-studio/torchserve:0.5.0-gpu',
    'ccr.ccs.tencentyun.com/cube-studio/torchserve:0.4.2-cpu', 'ccr.ccs.tencentyun.com/cube-studio/torchserve:0.4.2-gpu',
]
INFERNENCE_IMAGES = {
    "tfserving": TFSERVING_IMAGES,
    'torch-server': TORCHSERVER_IMAGES,
    'onnxruntime': ['ccr.ccs.tencentyun.com/cube-studio/onnxruntime:v1.0.0', 'ccr.ccs.tencentyun.com/cube-studio/onnxruntime:server-latest'],
    'triton-server': TRITONSERVER_IMAGES,
    'kfserving-tf': TFSERVING_IMAGES,
    "kfserving-torch": TORCHSERVER_IMAGES,
    "kfserving-triton": TRITONSERVER_IMAGES,
    'kfserving-sklearn': ['ccr.ccs.tencentyun.com/cube-studio/sklearnserver:v0.7.0'],
    'kfserving-xgboost': ['ccr.ccs.tencentyun.com/cube-studio/sklearnserver:v0.7.0'],
    'kfserving-lightgbm': ['ccr.ccs.tencentyun.com/cube-studio/lgbserver:v0.7.0'],
    'kfserving-paddle': ['ccr.ccs.tencentyun.com/cube-studio/paddleserver:v0.7.0']
}
INFERNENCE_COMMAND = {
    "tfserving": "/usr/bin/tf_serving_entrypoint.sh --model_config_file=/config/models.config --monitoring_config_file=/config/monitoring.config --platform_config_file=/config/platform.config",
    "torch-server": "torchserve --start --model-store /models/$model_name/ --models $model_name=$model_name.mar --foreground --log-config /config/log4j2.xml",
    "onnxruntime": "onnxruntime_server --model_path /models/",
    "triton-server": 'tritonserver --model-repository=/models/ --strict-model-config=true --log-verbose=1'
}
INFERNENCE_ENV = {
    "tfserving": ['TF_CPP_VMODULE=http_server=1', 'TZ=Asia/Shanghai'],
}
INFERNENCE_PORTS = {
    "tfserving": '8501',
    "torch-server": "8080,8081",
    "onnxruntime": "8001",
    "triton-server": "8000,8002"
}
INFERNENCE_METRICS = {
    "tfserving": '8501:/metrics',
    "torch-server": "8082:/metrics",
    "triton-server": "8002:/metrics"
}
INFERNENCE_HEALTH = {
    "tfserving": '8501:/v1/models/$model_name/versions/$model_version/metadata',
    "torch-server": "8080:/ping",
    "triton-server": "8000:/v2/health/ready"
}
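# The values above pack 'port[,port]' and 'port:path' conventions; an illustrative
# health-check URL built under that assumption (pod_ip is hypothetical):
#
#   port, path = INFERNENCE_HEALTH['torch-server'].split(':', 1)
#   health_url = 'http://%s:%s%s' % (pod_ip, port, path)  # -> http://<pod_ip>:8080/ping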
GRAFANA_TASK_PATH='/grafana/d/pod-info/pod-info?var-pod='
GRAFANA_SERVICE_PATH="/grafana/d/istio-service/istio-service?var-namespace=service&var-service="
GRAFANA_CLUSTER_PATH="/grafana/d/all-node/all-node?var-org="
GRAFANA_NODE_PATH="/grafana/d/node/node?var-node="
GPU_CHOICE_ARR = ["0","1(T4)","2(T4)","1(V100)","2(V100)","1(A100)","2(A100)","1(vgpu)"]
GPU_CHOICES = [[choice,choice] for choice in GPU_CHOICE_ARR]
# The cluster where this controller runs
ENVIRONMENT = get_env_variable('ENVIRONMENT', 'DEV').lower()
# Information for all training clusters
CLUSTERS = {
    # must match the name in the project expand config
    "dev": {
        "NAME": "dev",
        "KUBECONFIG": '/home/myapp/kubeconfig/dev-kubeconfig',
        "K8S_DASHBOARD_CLUSTER": '/k8s/dashboard/cluster/',
        "KFP_HOST": 'http://ml-pipeline.kubeflow:8888',
        "PIPELINE_URL": '/pipeline/#/',
        # "JUPYTER_DOMAIN": "kubeflow.local.com",  # use * if there is no domain; with a domain, set it to HOST
        # "NNI_DOMAIN": 'kubeflow.local.com'  # use * if there is no domain; with a domain, set it to HOST
    }
}
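# Illustrative lookup (an assumption about how callers use this dict): the controller
# selects its own cluster entry by the ENVIRONMENT name, falling back to "dev".
#
#   CLUSTER = CLUSTERS.get(ENVIRONMENT, CLUSTERS['dev'])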