提交 a02df9d8 authored 作者: junjie's avatar junjie

feat(backend):spark cache 初步实现

上级 a11eb21c
package io.dataease.commons.utils;
import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import java.util.concurrent.*;
/**
* @Author gin
* @Date 2021/4/13 4:08 下午
*/
public class CommonThreadPool {
private int corePoolSize = 10;
private int maxQueueSize = 10;
private int keepAliveSeconds = 600;
private ScheduledThreadPoolExecutor scheduledThreadPoolExecutor;
@PostConstruct
public void init() {
scheduledThreadPoolExecutor = new ScheduledThreadPoolExecutor(corePoolSize);
scheduledThreadPoolExecutor.setKeepAliveTime(keepAliveSeconds, TimeUnit.SECONDS);
}
@PreDestroy
public void shutdown() {
if (scheduledThreadPoolExecutor != null) {
scheduledThreadPoolExecutor.shutdown();
}
}
/**
* 线程池是否可用(实际队列数是否小于最大队列数)
*
* @return true为可用,false不可用
*/
public boolean available() {
return scheduledThreadPoolExecutor.getQueue().size() <= maxQueueSize;
}
/**
* 添加任务,不强制限制队列数
*
* @param task 任务
*/
public void addTask(Runnable task) {
scheduledThreadPoolExecutor.execute(task);
}
/**
* 添加延迟执行任务,不强制限制队列数
*
* @param task 任务
* @param delay 延迟时间
* @param unit 延迟时间单位
*/
public void scheduleTask(Runnable task, long delay, TimeUnit unit) {
scheduledThreadPoolExecutor.schedule(task, delay, unit);
}
/**
* 添加任务和超时时间(超时时间内未执行完的任务将被终止并移除线程池,防止任务执行时间过长而占用线程池)
*
* @param task 任务
* @param timeOut 超时时间
* @param timeUnit 超时时间单位
*/
public void addTask(Runnable task, long timeOut, TimeUnit timeUnit) {
scheduledThreadPoolExecutor.execute(() -> {
ExecutorService executorService = Executors.newSingleThreadExecutor();
try {
Future future = executorService.submit(task);
future.get(timeOut, timeUnit); // 此行会阻塞,直到任务执行完或超时
} catch (TimeoutException timeoutException) {
LogUtil.getLogger().error("timeout to execute task", timeoutException);
} catch (Exception exception) {
LogUtil.getLogger().error("failed to execute task", exception);
} finally {
if (!executorService.isShutdown()) {
executorService.shutdown();
}
}
});
}
public void setCorePoolSize(int corePoolSize) {
this.corePoolSize = corePoolSize;
}
public void setMaxQueueSize(int maxQueueSize) {
this.maxQueueSize = maxQueueSize;
}
public void setKeepAliveSeconds(int keepAliveSeconds) {
this.keepAliveSeconds = keepAliveSeconds;
}
}
package io.dataease.config;
import com.fit2cloud.autoconfigure.QuartzAutoConfiguration;
import io.dataease.commons.utils.CommonThreadPool;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
......@@ -33,31 +34,20 @@ public class CommonConfig {
return configuration;
}
@Bean
@ConditionalOnMissingBean
public JavaSparkContext javaSparkContext() {
public SparkSession javaSparkSession() {
SparkSession spark = SparkSession.builder()
.appName(env.getProperty("spark.appName", "DataeaseJob"))
.master(env.getProperty("spark.master", "local[*]"))
.config("spark.scheduler.mode", "FAIR")
.getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
return sc;
return spark;
}
@Bean
@ConditionalOnMissingBean
public SQLContext sqlContext(JavaSparkContext javaSparkContext) {
SQLContext sqlContext = new SQLContext(javaSparkContext);
sqlContext.setConf("spark.sql.shuffle.partitions", env.getProperty("spark.sql.shuffle.partitions", "1"));
sqlContext.setConf("spark.default.parallelism", env.getProperty("spark.default.parallelism", "1"));
return sqlContext;
}
@Bean
@ConditionalOnMissingBean
public KettleFileRepository kettleFileRepository()throws Exception{
public KettleFileRepository kettleFileRepository() throws Exception {
KettleEnvironment.init();
KettleFileRepository repository = new KettleFileRepository();
KettleFileRepositoryMeta kettleDatabaseMeta = new KettleFileRepositoryMeta("KettleFileRepository", "repo",
......@@ -65,4 +55,13 @@ public class CommonConfig {
repository.init(kettleDatabaseMeta);
return repository;
}
/**
 * Registers the shared {@link CommonThreadPool} bean, configured with 20 core threads,
 * a queue threshold of 100 and a keep-alive of 3600 seconds. The pool's {@code shutdown}
 * method is declared as the bean destroy method.
 */
@Bean(destroyMethod = "shutdown")
public CommonThreadPool resourcePoolThreadPool() {
    final CommonThreadPool pool = new CommonThreadPool();
    pool.setKeepAliveSeconds(3600);
    pool.setMaxQueueSize(100);
    pool.setCorePoolSize(20);
    return pool;
}
}
......@@ -6,12 +6,14 @@ import io.dataease.service.ScheduleService;
import io.dataease.service.dataset.DataSetTableTaskService;
import org.springframework.boot.context.event.ApplicationReadyEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.util.List;
@Component
@Order(value = 1)
public class AppStartListener implements ApplicationListener<ApplicationReadyEvent> {
@Resource
private ScheduleService scheduleService;
......
package io.dataease.listener;
import io.dataease.base.domain.DatasetTable;
import io.dataease.base.domain.DatasetTableExample;
import io.dataease.base.domain.DatasetTableField;
import io.dataease.base.domain.DatasetTableFieldExample;
import io.dataease.base.mapper.DatasetTableFieldMapper;
import io.dataease.base.mapper.DatasetTableMapper;
import io.dataease.commons.utils.CommonBeanFactory;
import io.dataease.commons.utils.CommonThreadPool;
import io.dataease.commons.utils.LogUtil;
import io.dataease.service.dataset.DataSetTableFieldsService;
import io.dataease.service.spark.SparkCalc;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.springframework.boot.context.event.ApplicationReadyEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.core.annotation.Order;
import org.springframework.core.env.Environment;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.util.List;
@Component
@Order(value = 2)
public class AppStartReadHBaseListener implements ApplicationListener<ApplicationReadyEvent> {
@Resource
private CommonThreadPool commonThreadPool;
@Resource
private SparkCalc sparkCalc;
@Resource
private Environment env; // 保存了配置文件的信息
@Resource
private DatasetTableMapper datasetTableMapper;
@Resource
private DataSetTableFieldsService dataSetTableFieldsService;
@Override
public void onApplicationEvent(ApplicationReadyEvent applicationReadyEvent) {
System.out.println("================= Read HBase start =================");
// 项目启动,从数据集中找到定时抽取的表,从HBase中读取放入缓存
DatasetTableExample datasetTableExample = new DatasetTableExample();
datasetTableExample.createCriteria().andModeEqualTo(1);
List<DatasetTable> datasetTables = datasetTableMapper.selectByExampleWithBLOBs(datasetTableExample);
for (DatasetTable table : datasetTables) {
commonThreadPool.addTask(() -> {
try {
List<DatasetTableField> fields = dataSetTableFieldsService.getFieldsByTableId(table.getId());
sparkCalc.getHBaseDataAndCache(table.getId(), fields);
} catch (Exception e) {
e.printStackTrace();
}
});
}
}
}
......@@ -4,6 +4,7 @@ import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import io.dataease.base.domain.*;
import io.dataease.base.mapper.ChartViewMapper;
import io.dataease.base.mapper.DatasetTableFieldMapper;
import io.dataease.commons.utils.AuthUtils;
import io.dataease.commons.utils.BeanUtils;
import io.dataease.controller.request.chart.ChartViewRequest;
......@@ -16,6 +17,7 @@ import io.dataease.dto.chart.ChartViewDTO;
import io.dataease.dto.chart.ChartViewFieldDTO;
import io.dataease.dto.chart.Series;
import io.dataease.dto.dataset.DataTableInfoDTO;
import io.dataease.service.dataset.DataSetTableFieldsService;
import io.dataease.service.dataset.DataSetTableService;
import io.dataease.service.spark.SparkCalc;
import org.apache.commons.collections4.CollectionUtils;
......@@ -41,6 +43,8 @@ public class ChartViewService {
private DatasourceService datasourceService;
@Resource
private SparkCalc sparkCalc;
@Resource
private DataSetTableFieldsService dataSetTableFieldsService;
public ChartViewWithBLOBs save(ChartViewWithBLOBs chartView) {
checkName(chartView);
......@@ -121,9 +125,9 @@ public class ChartViewService {
}
data = datasourceProvider.getData(datasourceRequest);
} else if (table.getMode() == 1) {// 抽取
// DataTableInfoDTO dataTableInfoDTO = new Gson().fromJson(table.getInfo(), DataTableInfoDTO.class);
// String tableName = dataTableInfoDTO.getTable() + "-" + table.getDataSourceId();// todo hBase table name maybe change
data = sparkCalc.getData(table.getId(), xAxis, yAxis, "tmp_" + view.getId().split("-")[0]);
// 获取数据集de字段
List<DatasetTableField> fields = dataSetTableFieldsService.getFieldsByTableId(table.getId());
data = sparkCalc.getData(table.getId(), fields, xAxis, yAxis, "tmp_" + view.getId().split("-")[0]);
}
// 图表组件可再扩展
......
......@@ -60,4 +60,10 @@ public class DataSetTableFieldsService {
datasetTableFieldExample.createCriteria().andIdIn(ids);
return datasetTableFieldMapper.selectByExample(datasetTableFieldExample);
}
/**
 * Returns the fields whose table id equals the given id.
 *
 * @param id the dataset table id to match
 * @return the matching fields as returned by the mapper
 */
public List<DatasetTableField> getFieldsByTableId(String id) {
    final DatasetTableFieldExample byTableId = new DatasetTableFieldExample();
    byTableId.createCriteria().andTableIdEqualTo(id);
    List<DatasetTableField> matches = datasetTableFieldMapper.selectByExample(byTableId);
    return matches;
}
}
package io.dataease.service.spark;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import java.util.HashMap;
import java.util.Map;
/**
 * Process-wide in-memory registry of Spark DataFrames, keyed by a string.
 *
 * <p>The single instance is created eagerly during class initialization and the backing map
 * is wrapped with a synchronized view, so the registry can be shared by concurrent tasks.
 * Null keys and values are handled the same way a plain {@link HashMap} handles them.
 *
 * @Author gin
 * @Date 2021/4/13 12:32 PM
 */
public class CacheUtil {
    // Eager initialization: the lazy null-check variant could create several instances under
    // concurrency, and every construction replaced the shared map.
    private static final CacheUtil INSTANCE = new CacheUtil();

    private final Map<String, Dataset<Row>> cacheMap =
            java.util.Collections.synchronizedMap(new HashMap<String, Dataset<Row>>());

    private CacheUtil() {
    }

    /**
     * Returns the shared instance.
     *
     * @return the singleton cache
     */
    public static CacheUtil getInstance() {
        return INSTANCE;
    }

    /**
     * Adds an entry to the cache, replacing any entry stored under the same key.
     *
     * @param key the cache key
     * @param obj the DataFrame to store
     */
    public void addCacheData(String key, Dataset<Row> obj) {
        cacheMap.put(key, obj);
    }

    /**
     * Looks an entry up in the cache.
     *
     * @param key the cache key
     * @return the stored DataFrame, or null if nothing is stored under the key
     */
    public Dataset<Row> getCacheData(String key) {
        return cacheMap.get(key);
    }

    /**
     * Removes an entry from the cache.
     *
     * @param key the cache key
     */
    public void removeCacheData(String key) {
        cacheMap.remove(key);
    }
}
package io.dataease.service.spark;
import io.dataease.base.domain.DatasetTableField;
import io.dataease.commons.utils.CommonBeanFactory;
import io.dataease.dto.chart.ChartViewFieldDTO;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Result;
......@@ -42,21 +44,56 @@ public class SparkCalc {
@Resource
private Environment env; // 保存了配置文件的信息
public List<String[]> getData(String hTable, List<ChartViewFieldDTO> xAxis, List<ChartViewFieldDTO> yAxis, String tmpTable) throws Exception {
/**
 * Runs the chart query for a dataset table and returns the result rows as string arrays.
 *
 * <p>The DataFrame is taken from {@link CacheUtil} under the key {@code hTable}; when it is not
 * cached it is loaded through {@code getHBaseDataAndCache}. The DataFrame is registered as the
 * temporary view {@code tmpTable} and queried with the SQL built by {@code getSQL}.
 *
 * @param hTable   the cache key / table identifier
 * @param fields   the dataset fields used when the data has to be loaded
 * @param xAxis    the x-axis fields passed to {@code getSQL}
 * @param yAxis    the y-axis fields passed to {@code getSQL}
 * @param tmpTable the name of the temporary view to register
 * @return one string array per result row; null cells are rendered as the string "null"
 * @throws Exception if loading or querying fails
 */
public List<String[]> getData(String hTable, List<DatasetTableField> fields, List<ChartViewFieldDTO> xAxis, List<ChartViewFieldDTO> yAxis, String tmpTable) throws Exception {
    // Spark contexts are derived from the shared SparkSession bean.
    SparkSession session = CommonBeanFactory.getBean(SparkSession.class);
    JavaSparkContext javaContext = new JavaSparkContext(session.sparkContext());
    SQLContext sqlContext = new SQLContext(javaContext);
    sqlContext.setConf("spark.sql.shuffle.partitions", env.getProperty("spark.sql.shuffle.partitions", "1"));
    sqlContext.setConf("spark.default.parallelism", env.getProperty("spark.default.parallelism", "1"));

    // Use the cached DataFrame when present, otherwise load (and cache) it.
    Dataset<Row> cached = CacheUtil.getInstance().getCacheData(hTable);
    Dataset<Row> dataFrame = ObjectUtils.isEmpty(cached)
            ? getHBaseDataAndCache(javaContext, sqlContext, hTable, fields)
            : cached;
    dataFrame.createOrReplaceTempView(tmpTable);

    // Convert every result row into an array of strings.
    List<String[]> data = new ArrayList<>();
    for (Row row : sqlContext.sql(getSQL(xAxis, yAxis, tmpTable)).collectAsList()) {
        int width = row.length();
        String[] cells = new String[width];
        for (int col = 0; col < width; col++) {
            // String.valueOf(Object) yields "null" for a null cell and toString() otherwise.
            cells[col] = String.valueOf(row.get(col));
        }
        data.add(cells);
    }
    return data;
}
/**
 * Convenience overload that builds the Spark contexts from the shared SparkSession bean and
 * delegates to the four-argument {@code getHBaseDataAndCache}.
 *
 * @param hTable the table identifier passed on to the delegate
 * @param fields the dataset fields passed on to the delegate
 * @return the DataFrame returned by the delegate
 * @throws Exception if the delegate fails
 */
public Dataset<Row> getHBaseDataAndCache(String hTable, List<DatasetTableField> fields) throws Exception {
    SparkSession session = CommonBeanFactory.getBean(SparkSession.class);
    JavaSparkContext javaContext = new JavaSparkContext(session.sparkContext());

    SQLContext sql = new SQLContext(javaContext);
    String shufflePartitions = env.getProperty("spark.sql.shuffle.partitions", "1");
    sql.setConf("spark.sql.shuffle.partitions", shufflePartitions);
    String defaultParallelism = env.getProperty("spark.default.parallelism", "1");
    sql.setConf("spark.default.parallelism", defaultParallelism);

    return getHBaseDataAndCache(javaContext, sql, hTable, fields);
}
public Dataset<Row> getHBaseDataAndCache(JavaSparkContext sparkContext, SQLContext sqlContext, String hTable, List<DatasetTableField> fields) throws Exception {
Scan scan = new Scan();
scan.addFamily(column_family.getBytes());
ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
String scanToString = new String(Base64.getEncoder().encode(proto.toByteArray()));
// Spark Context
// JavaSparkContext sparkContext = CommonBeanFactory.getBean(JavaSparkContext.class);
SparkSession spark = SparkSession.builder()
.appName(env.getProperty("spark.appName", "DataeaseJob"))
.master(env.getProperty("spark.master", "local[*]"))
.config("spark.scheduler.mode", "FAIR")
.getOrCreate();
JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());
// HBase config
// Configuration conf = CommonBeanFactory.getBean(Configuration.class);
org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
......@@ -73,7 +110,7 @@ public class SparkCalc {
while (tuple2Iterator.hasNext()) {
Result result = tuple2Iterator.next()._2;
List<Object> list = new ArrayList<>();
xAxis.forEach(x -> {
fields.forEach(x -> {
String l = Bytes.toString(result.getValue(column_family.getBytes(), x.getOriginName().getBytes()));
if (x.getDeType() == 0 || x.getDeType() == 1) {
list.add(l);
......@@ -89,22 +126,6 @@ public class SparkCalc {
list.add(Double.valueOf(l));
}
});
yAxis.forEach(y -> {
String l = Bytes.toString(result.getValue(column_family.getBytes(), y.getOriginName().getBytes()));
if (y.getDeType() == 0 || y.getDeType() == 1) {
list.add(l);
} else if (y.getDeType() == 2) {
if (StringUtils.isEmpty(l)) {
l = "0";
}
list.add(Long.valueOf(l));
} else if (y.getDeType() == 3) {
if (StringUtils.isEmpty(l)) {
l = "0.0";
}
list.add(Double.valueOf(l));
}
});
iterator.add(RowFactory.create(list.toArray()));
}
return iterator.iterator();
......@@ -112,7 +133,7 @@ public class SparkCalc {
List<StructField> structFields = new ArrayList<>();
// struct顺序要与rdd顺序一致
xAxis.forEach(x -> {
fields.forEach(x -> {
if (x.getDeType() == 0 || x.getDeType() == 1) {
structFields.add(DataTypes.createStructField(x.getOriginName(), DataTypes.StringType, true));
} else if (x.getDeType() == 2) {
......@@ -121,40 +142,15 @@ public class SparkCalc {
structFields.add(DataTypes.createStructField(x.getOriginName(), DataTypes.DoubleType, true));
}
});
yAxis.forEach(y -> {
if (y.getDeType() == 0 || y.getDeType() == 1) {
structFields.add(DataTypes.createStructField(y.getOriginName(), DataTypes.StringType, true));
} else if (y.getDeType() == 2) {
structFields.add(DataTypes.createStructField(y.getOriginName(), DataTypes.LongType, true));
} else if (y.getDeType() == 3) {
structFields.add(DataTypes.createStructField(y.getOriginName(), DataTypes.DoubleType, true));
}
});
StructType structType = DataTypes.createStructType(structFields);
// Spark SQL Context
// SQLContext sqlContext = CommonBeanFactory.getBean(SQLContext.class);
SQLContext sqlContext = new SQLContext(sparkContext);
sqlContext.setConf("spark.sql.shuffle.partitions", env.getProperty("spark.sql.shuffle.partitions", "1"));
sqlContext.setConf("spark.default.parallelism", env.getProperty("spark.default.parallelism", "1"));
Dataset<Row> dataFrame = sqlContext.createDataFrame(rdd, structType);
dataFrame.createOrReplaceTempView(tmpTable);
Dataset<Row> sql = sqlContext.sql(getSQL(xAxis, yAxis, tmpTable));
// transform
List<String[]> data = new ArrayList<>();
List<Row> list = sql.collectAsList();
for (Row row : list) {
String[] r = new String[row.length()];
for (int i = 0; i < row.length(); i++) {
r[i] = row.get(i) == null ? "null" : row.get(i).toString();
}
data.add(r);
}
return data;
Dataset<Row> dataFrame = sqlContext.createDataFrame(rdd, structType).persist();
CacheUtil.getInstance().addCacheData(hTable, dataFrame);
dataFrame.count();
return dataFrame;
}
private String getSQL(List<ChartViewFieldDTO> xAxis, List<ChartViewFieldDTO> yAxis, String table) {
public String getSQL(List<ChartViewFieldDTO> xAxis, List<ChartViewFieldDTO> yAxis, String table) {
// 字段汇总 排序等
String[] field = yAxis.stream().map(y -> "CAST(" + y.getSummary() + "(" + y.getOriginName() + ") AS DECIMAL(20,2)) AS _" + y.getSummary() + "_" + y.getOriginName()).toArray(String[]::new);
String[] group = xAxis.stream().map(ChartViewFieldDTO::getOriginName).toArray(String[]::new);
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论