cube-studio/myapp/init/init-etl-pipeline.json

[
{
"name": "dau",
"describe": "dau计算",
"config": {
"alert_user": "admin"
},
"workflow": "airflow",
"dag_json": {
"cos导入hdfs-1686184253953": {
"label": "数据导入",
"location": [
304,
96
],
"color": {
"color": "rgba(0,170,200,1)",
"bg": "rgba(0,170,200,0.02)"
},
"template": "cos导入hdfs",
"template-group": "出库入库",
"task-config": {
"crontab": "1 1 * * *",
"selfDepend": "单实例运行",
"ResourceGroup": "default",
"alert_user": "admin,",
"timeout": "0",
"retry": "0",
"hdfsPath": "hdfs://xx/xxx",
"cosPath": "/xx/${YYYYMMDD}.tar.gz",
"ifNeedZip": "1",
"label": "数据导入"
},
"upstream": [],
"task_id": 1
},
"hdfs入库至hive-1686184263002": {
"label": "数据入库",
"location": [
304,
224
],
"color": {
"color": "rgba(0,170,200,1)",
"bg": "rgba(0,170,200,0.02)"
},
"template": "hdfs入库至hive",
"template-group": "出库入库",
"task-config": {
"crontab": "1 1 * * *",
"selfDepend": "单实例运行",
"ResourceGroup": "default",
"alert_user": "admin,",
"timeout": "0",
"retry": "0",
"charSet": "UTF-8",
"databaseName": "",
"tableName": "",
"delimiter": "9",
"failedOnZeroWrited": "1",
"partitionType": "P_${YYYYMMDDHH}",
"sourceFilePath": "",
"sourceFileNames": "*",
"sourceColumnNames": "",
"targetColumnNames": "",
"loadMode": "TRUNCATE",
"label": "数据入库"
},
"upstream": [
"cos导入hdfs-1686184253953"
],
"task_id": 2
},
"SQL-1686184276800": {
"label": "局部特征计算",
"location": [
-16,
352
],
"color": {
"color": "rgba(0,200,153,1)",
"bg": "rgba(0,200,153,0.02)"
},
"template": "SQL",
"template-group": "数据计算",
"task-config": {
"crontab": "1 1 * * *",
"selfDepend": "单实例运行",
"ResourceGroup": "default",
"alert_user": "admin,",
"timeout": "0",
"retry": "0",
"filterSQL": "\n --库名替换下面的demo_database\n use demo_database;\n\n --建表语句替换下面的demo_table修改字段。一定要加“if not exists”这样使只在第一次运行时建表\n CREATE TABLE if not exists demo_table(\n qimei36 STRING COMMENT '唯一设备ID',\n userid_id STRING COMMENT '用户id各app的用户id',\n device_id STRING COMMENT '设备id各app的device_id',\n ftime INT COMMENT '数据分区时间 格式yyyymmdd'\n )\n PARTITION BY LIST( ftime ) --定义分区字段替换掉ftime。\n (\n PARTITION p_20220323 VALUES IN ( 20220323 ), --初始分区分区名替换p_20220323分区值替换20220323\n PARTITION default\n )\n STORED AS ORCFILE COMPRESS;\n\n -- 分区,根据时间参数新建分区。\n alter table demo_table drop partition (p_${YYYYMMDD});\n alter table demo_table add partition p_${YYYYMMDD} values in (${YYYYMMDD});\n\n -- 写入用你的sql逻辑替换。\n insert table demo_table\n select * from other_db::other_table partition(p_${YYYYMMDD}) t;\n ",
"special_para": "set hive.exec.parallel = true;set hive.execute.engine=spark;set hive.multi.join.use.hive=false;set hive.spark.failed.retry=false;",
"label": "局部特征计算"
},
"upstream": [
"hdfs入库至hive-1686184263002"
],
"task_id": 3
},
"SparkScala-1686184279367": {
"label": "局部特征计算",
"location": [
304,
352
],
"color": {
"color": "rgba(0,200,153,1)",
"bg": "rgba(0,200,153,0.02)"
},
"template": "SparkScala",
"template-group": "数据计算",
"task-config": {
"crontab": "1 1 * * *",
"selfDepend": "单实例运行",
"ResourceGroup": "default",
"alert_user": "admin,",
"timeout": "0",
"retry": "0",
"jar_path": "",
"className": "",
"files": "",
"programSpecificParams": "",
"options": "",
"dynamicAllocation": "1",
"driver_memory": "2g",
"num_executors": "4",
"executor_memory": "2g",
"executor_cores": "2",
"task.main.timeout": "480",
"task.check.timeout": "5",
"label": "局部特征计算"
},
"upstream": [
"hdfs入库至hive-1686184263002"
],
"task_id": 4
},
"pyspark-1686184281148": {
"label": "局部特征计算",
"location": [
608,
352
],
"color": {
"color": "rgba(0,200,153,1)",
"bg": "rgba(0,200,153,0.02)"
},
"template": "pyspark",
"template-group": "数据计算",
"task-config": {
"crontab": "1 1 * * *",
"selfDepend": "单实例运行",
"ResourceGroup": "default",
"alert_user": "admin,",
"timeout": "0",
"retry": "0",
"py_script_path": "",
"files": "",
"pyFiles": "",
"programSpecificParams": "",
"options": "",
"dynamicAllocation": 1,
"driver_memory": "2g",
"num_executors": 4,
"executor_memory": "2g",
"executor_cores": 2,
"task.main.timeout": 480,
"task.check.timeout": "5",
"label": "局部特征计算"
},
"upstream": [
"hdfs入库至hive-1686184263002"
],
"task_id": 5
},
"hive出库至hdfs-1686184293917": {
"label": "结果计算",
"location": [
304,
496
],
"color": {
"color": "rgba(0,170,200,1)",
"bg": "rgba(0,170,200,0.02)"
},
"template": "hive出库至hdfs",
"template-group": "出库入库",
"task-config": {
"crontab": "1 1 * * *",
"selfDepend": "单实例运行",
"ResourceGroup": "default",
"alert_user": "admin,",
"timeout": "0",
"retry": "0",
"databaseName": "",
"destCheckFileName": "",
"destCheckFilePath": "",
"destFileDelimiter": "9",
"destFilePath": "",
"filterSQL": "select t1,t2,t3 from your_table where imp_date=${YYYYMMDD}",
"label": "结果计算"
},
"upstream": [
"SQL-1686184276800",
"pyspark-1686184281148",
"SparkScala-1686184279367"
],
"task_id": 6
},
"hdfs导入cos-1686184296749": {
"label": "数据导出",
"location": [
304,
608
],
"color": {
"color": "rgba(0,170,200,1)",
"bg": "rgba(0,170,200,0.02)"
},
"template": "hdfs导入cos",
"template-group": "出库入库",
"task-config": {
"crontab": "1 1 * * *",
"selfDepend": "单实例运行",
"ResourceGroup": "default",
"alert_user": "admin,",
"timeout": "0",
"retry": "0",
"hdfsPath": "hdfs://xx/xxx",
"cosPath": "/xx/xx/${YYYYMMDD}.tar.gz",
"ifNeedZip": "1",
"label": "数据导出"
},
"upstream": [
"hive出库至hdfs-1686184293917"
],
"task_id": 7
}
}
}
]
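
For orientation, below is a minimal Python sketch of how a consumer of this init file might read it: load the pipeline list, expand the ${YYYYMMDD}/${YYYYMMDDHH} date macros, and visit the dag_json tasks in dependency order via their "upstream" lists. The file path, the macro semantics (partition date equals run date), and the traversal are illustrative assumptions, not cube-studio's actual loader.

# Minimal sketch, not the cube-studio loader: parse the init file, expand
# the assumed date macros, and topologically walk tasks over "upstream" edges.
import json
from collections import deque
from datetime import datetime


def expand_macros(text, run_time):
    # Assumed macro set; the real platform may support more variables.
    return (text.replace("${YYYYMMDD}", run_time.strftime("%Y%m%d"))
                .replace("${YYYYMMDDHH}", run_time.strftime("%Y%m%d%H")))


def topo_order(dag):
    # Simple topological sort: repeatedly emit tasks whose upstream set is empty.
    pending = {name: set(node.get("upstream", [])) for name, node in dag.items()}
    ready = deque(sorted(n for n, ups in pending.items() if not ups))
    order = []
    while ready:
        name = ready.popleft()
        order.append(name)
        for other, ups in pending.items():
            if name in ups:
                ups.remove(name)
                if not ups:
                    ready.append(other)
    if len(order) != len(dag):
        raise ValueError("cycle or missing upstream reference in dag_json")
    return order


if __name__ == "__main__":
    # Path assumed relative to the repo root, matching this file's location.
    with open("myapp/init/init-etl-pipeline.json", encoding="utf-8") as f:
        pipelines = json.load(f)
    run_time = datetime.now()
    for pipeline in pipelines:
        dag = pipeline["dag_json"]
        for name in topo_order(dag):
            cfg = {k: expand_macros(v, run_time) if isinstance(v, str) else v
                   for k, v in dag[name]["task-config"].items()}
            print(pipeline["name"], name, "->", cfg.get("crontab"))

On this file the walk yields the import task first, then the Hive load, then the three parallel feature-computation tasks, and finally the export tasks, mirroring the upstream edges declared above.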