You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
222 lines
10 KiB
222 lines
10 KiB
#!/bin/bash
|
|
set -e
|
|
echo "******************"
|
|
echo "** rollout load **"
|
|
echo "******************"
|
|
PWD=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
|
|
parentPWD="$(dirname "$PWD")"
|
|
source $PWD/../functions.sh
|
|
|
|
GEN_DATA_SCALE=$1
|
|
EXPLAIN_ANALYZE=$2
|
|
RANDOM_DISTRIBUTION=$3
|
|
MULTI_USER_COUNT=$4
|
|
SINGLE_USER_ITERATIONS=$5
|
|
VERSION=$6
|
|
UTIL=$7
|
|
|
|
if [[ "$GEN_DATA_SCALE" == "" || "$EXPLAIN_ANALYZE" == "" || "$RANDOM_DISTRIBUTION" == "" || "$MULTI_USER_COUNT" == "" || "$SINGLE_USER_ITERATIONS" == "" || "$VERSION" == "" ]]; then
|
|
echo "You must provide the scale as a parameter in terms of Gigabytes, true/false to run queries with EXPLAIN ANALYZE option, true/false to use random distrbution, multi-user count, the number of sql iterations, and the name of the database under test."
|
|
echo "Example: ./rollout.sh 100 false false 5 1 exasol"
|
|
exit 1
|
|
fi
|
|
|
|
step=load
|
|
init_log $step
|
|
|
|
if [ -f "$PWD/batch_load_$VERSION.sql" ]; then
|
|
rm -f $PWD/batch_load_$VERSION.sql
|
|
fi
|
|
touch $PWD/batch_load_$VERSION.sql
|
|
mylogfile=$PWD/../log/data_load.log
|
|
|
|
PARALLEL=$(lscpu --parse=cpu | grep -v "#" | wc -l)
|
|
if [ "$PARALLEL" -gt "8" ]; then
|
|
PARALLEL=$(($PARALLEL - 5))
|
|
fi
|
|
echo "parallel: $PARALLEL"
|
|
|
|
# Laden der Daten in die Datenbank. Dazu werden, sofern möglich, alle Anweisungen in eine Datei kopiert und dann gesammelt ausgeführt. Ansonsten wird jede Ladeanweisung einzeln ausgeführt.
|
|
# Ein satz von Dateien (*.postgresql.*.sql) wird benutzt um darüber zu iterieren und an die Tabellennamen zu kommen.
|
|
# Abschließend werden gegebenenfalls noch Constraints (primary und foreign keys) erstellt, die beim Laden der Daten gestört hätten.
|
|
# Für postgres und ignite werden noch ein zuästzlich Index auf der lineitem Tabelle erstellt, da diese sonst einige Queries in keiner vernünftigen Zeit schaffen würden.
|
|
# Um andere Daten zu laden, müssen Dateien analog zu den existierenden angelegt werden.
|
|
if [[ "$VERSION" == "exasol" ]]; then
|
|
filter="postgresql"
|
|
|
|
for i in $(ls $PWD/*.$filter.*.sql); do
|
|
id=$(echo $i | awk -F '.' '{print $1}')
|
|
table_name=$(echo $i | awk -F '.' '{print $3}')
|
|
for p in $(seq 1 $PARALLEL); do
|
|
filename=$(echo $PWD/../01_gen_data/data/psqldata_$p/$table_name.tbl*)
|
|
if [[ -f $filename && -s $filename ]]; then
|
|
start_log
|
|
filename="'""$filename""'"
|
|
stmt='import INTO tpch.'$table_name' FROM LOCAL CSV FILE '$filename' COLUMN SEPARATOR = '\''|'\'' ROW SEPARATOR = '\''LF'\'' ;'
|
|
echo $stmt >> $PWD/batch_load_$VERSION.sql
|
|
fi
|
|
done
|
|
done
|
|
|
|
echo "loading data may take awhile"
|
|
STARTTIME=$(date +%s%N)
|
|
$parentPWD/EXAplus-7.1.2/exaplus -c localhost/$UTIL:8563 -u sys -p exasol -f $PWD/batch_load_$VERSION.sql > $mylogfile
|
|
ENDTIME=$(date +%s%N)
|
|
grep "Elapsed: " $mylogfile | awk -F '.' '{SUM += $2}' >> $mylogfile
|
|
echo "It takes $((($ENDTIME - $STARTTIME)/1000000)) milliseconds to complete data loading." >> $mylogfile
|
|
$parentPWD/EXAplus-7.1.2/exaplus -c localhost/$UTIL:8563 -u sys -p exasol -f $PWD/17.exasol.constraints.sql
|
|
|
|
elif [ "$VERSION" == "postgresql" ] || [ "$VERSION" == "gpdb" ]; then
|
|
filter="postgresql"
|
|
|
|
STARTTIME=$(date +%s%N)
|
|
for i in $(ls $PWD/*.$filter.*.sql); do
|
|
id=$(echo $i | awk -F '.' '{print $1}')
|
|
schema_name=$(echo $i | awk -F '.' '{print $2}')
|
|
table_name=$(echo $i | awk -F '.' '{print $3}')
|
|
for p in $(seq 1 $PARALLEL); do
|
|
filename=$(echo $PWD/../01_gen_data/data/psqldata_$p/$table_name.tbl*)
|
|
if [[ -f $filename && -s $filename ]]; then
|
|
start_log
|
|
filename="'""$filename""'"
|
|
echo "filename is $filename"
|
|
psql -e -h localhost -p 5439 -U p -d benchmark -v ON_ERROR_STOP=1 -c '\copy tpch.'$table_name' FROM '$filename' delimiter '\''|'\'' csv;'
|
|
fi
|
|
done
|
|
done
|
|
ENDTIME=$(date +%s%N)
|
|
echo "It takes $((($ENDTIME - $STARTTIME)/1000000)) milliseconds to complete data loading." >> $mylogfile
|
|
psql -h localhost -p 5439 -U p -d benchmark -v ON_ERROR_STOP=1 -q -a -P pager=off -f $PWD/17.postgres.constraints.sql
|
|
|
|
elif [ "$VERSION" == "ignite" ]; then
|
|
filter="ignite"
|
|
|
|
# Wenn eine beliebige Daten-Datei ein "|" enthält, dann enthalten das alle Dateien und diese müssen bearbeitet werden
|
|
# da bei ignite kein separator angegeben werden kann. Dazu werden zunächst alle , in ; geändert, um anschließend | in , ändern zu können
|
|
line=$(head -n 1 $parentPWD/01_gen_data/data/psqldata_1/region.tbl)
|
|
if [[ $line == *"|"* ]]; then
|
|
echo "transforming data for ignite"
|
|
for p in $(seq 1 $PARALLEL); do
|
|
echo "set $p"
|
|
sed -i -- 's/,/;/g; s/|/,/g' $parentPWD/01_gen_data/data/psqldata_$p/*
|
|
done
|
|
fi
|
|
batch_load_ignite=$PWD/batch_load_ignite.sql
|
|
touch $batch_load_ignite
|
|
for i in $(ls $PWD/*.$filter.*.sql); do
|
|
id=$(echo $i | awk -F '.' '{print $1}')
|
|
table_name=$(echo $i | awk -F '.' '{print $3}')
|
|
for p in $(seq 1 $PARALLEL); do
|
|
filename=$(echo $parentPWD/01_gen_data/data/psqldata_$p/$table_name.tbl*)
|
|
if [[ -f $filename && -s $filename ]]; then
|
|
start_log
|
|
line=$(head -n 1 $i)
|
|
echo "COPY FROM '$filename' $line" >> $batch_load_ignite
|
|
fi
|
|
done
|
|
done
|
|
echo "loading data may take awhile"
|
|
STARTTIME=$(date +%s%N)
|
|
source $PWD/../apache-ignite-2.11.0-bin/bin/sqlline.sh -u jdbc:ignite:thin://127.0.0.1/ -n ignite -p ignite --run=$batch_load_ignite
|
|
ENDTIME=$(date +%s%N)
|
|
grep "Elapsed: " $mylogfile | awk -F '.' '{SUM += $2}' >> $mylogfile
|
|
echo "It takes $((($ENDTIME - $STARTTIME)/1000000)) milliseconds to complete data loading." >> $mylogfile
|
|
rm $batch_load_ignite
|
|
source $PWD/../apache-ignite-2.11.0-bin/bin/sqlline.sh -u jdbc:ignite:thin://127.0.0.1/ -n ignite -p ignite --run=$PWD/17.ignite.constraints.sql
|
|
elif [ "$VERSION" == "cratedb" ]; then
|
|
# filter wird später nur benutzt, um über einen Satz von .sql-Dateien zu iterieren und über deren Namen an die Tabellennamen zu kommen
|
|
filter="postgresql"
|
|
|
|
if [ "1" == "1" ]; then
|
|
# Wenn eine beliebige Daten-Datei ein "|" enthält, dann enthalten das alle Dateien und diese müssen bearbeitet werden
|
|
line=$(head -n 1 $parentPWD/01_gen_data/data/psqldata_1/region.tbl)
|
|
if [[ $line == *"|"* ]]; then
|
|
echo "transforming data for cratedb"
|
|
for p in $(seq 1 $PARALLEL); do
|
|
echo "Bearbeiten von $p"
|
|
# Da bei cratedb kein separator angegeben werden kann, werden zunächst alle , in ; geändert, um anschließend | in , ändern zu können.
|
|
sed -i -- 's/,/;/g; s/|/,/g' $parentPWD/01_gen_data/data/psqldata_$p/*
|
|
# Alle Datumsangaben müssen zwischen " stehen
|
|
sed -i -r 's/([0-9]{4}-[0-9]{2}-[0-9]{2})/"\1"/g' $parentPWD/01_gen_data/data/psqldata_$p/*
|
|
done
|
|
|
|
# Jede Daten-Datei muss die Spaltennamen enthalten.
|
|
# Außerdem muss jede Datei die .csv-Endung haben.
|
|
echo "insert table headers"
|
|
for i in $(ls $PWD/*.$filter.*.sql); do
|
|
id=$(echo $i | awk -F '.' '{print $1}')
|
|
table_name=$(echo $i | awk -F '.' '{print $3}')
|
|
for p in $(seq 1 $PARALLEL); do
|
|
sourcefilename=$(echo $parentPWD/01_gen_data/data/psqldata_$p/$table_name.tbl*)
|
|
if [[ -f $sourcefilename && -s $sourcefilename ]]; then
|
|
case "$table_name" in
|
|
lineitem) firstline="l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment,dummy";;
|
|
supplier) firstline="s_suppkey,s_name,s_address,s_nationkey,s_phone,s_acctbal,s_comment,dummy";;
|
|
region) firstline="r_regionkey,r_name,r_comment,dummy";;
|
|
partsupp) firstline="ps_partkey,ps_suppkey,ps_availqty,ps_supplycost,ps_comment,dummy";;
|
|
part) firstline="p_partkey,p_name,p_mfgr,p_brand,p_type,p_size,p_container,p_retailprice,p_comment,dummy";;
|
|
orders) firstline="o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment,dummy";;
|
|
nation) firstline="n_nationkey,n_name,n_regionkey,n_comment,dummy";;
|
|
customer) firstline="c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment,dummy";;
|
|
esac
|
|
sed -i "1s/^/$firstline\n/" $sourcefilename
|
|
mv $sourcefilename $parentPWD/01_gen_data/data/psqldata_$p/$table_name.csv
|
|
fi
|
|
|
|
done
|
|
done
|
|
fi
|
|
# Nach Bearbeitung der Daten-Dateien müssen diese in den Container der Masternode kopiert werden.
|
|
echo "tar -czvf data.tar.gz -C $PWD/../01_gen_data/data ."
|
|
tar -czvf data.tar.gz -C $PWD/../01_gen_data/data .
|
|
echo "docker cp data.tar.gz crate1:/data/data.tar.gz"
|
|
docker cp $parentPWD/data.tar.gz crate1:/data/data.tar.gz
|
|
echo "docker exec -it crate1 tar -xzvf /data/data.tar.gz"
|
|
docker exec -it crate1 tar -xzvf /data/data.tar.gz
|
|
fi
|
|
|
|
STARTTIME=$(date +%s%N)
|
|
for i in $(ls $PWD/*.$filter.*.sql); do
|
|
id=$(echo $i | awk -F '.' '{print $1}')
|
|
table_name=$(echo $i | awk -F '.' '{print $3}')
|
|
for p in $(seq 1 $PARALLEL); do
|
|
sourcefilename=$(echo $PWD/../01_gen_data/data/psqldata_$p/$table_name.csv)
|
|
filename=$(echo psqldata_$p/$table_name.csv)
|
|
echo "$filename"
|
|
if [[ -f $sourcefilename && -s $sourcefilename ]]; then
|
|
start_log
|
|
echo "COPY tpch.$table_name FROM 'file:///data/$filename';"
|
|
$parentPWD/crash-cli/crash --hosts "localhost:$UTIL" -c "COPY tpch.$table_name FROM 'file:///data/$filename';"
|
|
fi
|
|
done
|
|
done
|
|
echo "loading data may take awhile"
|
|
ENDTIME=$(date +%s%N)
|
|
grep "Elapsed: " $mylogfile | awk -F '.' '{SUM += $2}' >> $mylogfile
|
|
echo "It takes $((($ENDTIME - $STARTTIME)/1000000)) milliseconds to complete data loading" >> $mylogfile
|
|
elif [ "$VERSION" == "mariadbcs" ]; then
|
|
filter="postgresql"
|
|
|
|
STARTTIME=$(date +%s%N)
|
|
for i in $(ls $PWD/*.$filter.*.sql); do
|
|
id=$(echo $i | awk -F '.' '{print $1}')
|
|
schema_name=$(echo $i | awk -F '.' '{print $2}')
|
|
table_name=$(echo $i | awk -F '.' '{print $3}')
|
|
for p in $(seq 1 $PARALLEL); do
|
|
filename=$(echo $PWD/../01_gen_data/data/psqldata_$p/$table_name.tbl*)
|
|
if [[ -f $filename && -s $filename ]]; then
|
|
start_log
|
|
filename="'""$filename""'"
|
|
echo "filename is $filename"
|
|
mariadb --protocol tcp --host localhost -u nsc --password=mariadbcs -e 'LOAD DATA LOCAL INFILE '$filename' INTO TABLE tpch.'$table_name' FIELDS TERMINATED BY '\''|'\'';'
|
|
fi
|
|
done
|
|
done
|
|
ENDTIME=$(date +%s%N)
|
|
echo "It takes $((($ENDTIME - $STARTTIME)/1000000)) milliseconds to complete data loading." >> $mylogfile
|
|
else
|
|
echo "ERROR: Unsupported VERSION $VERSION!"
|
|
exit 1
|
|
fi
|
|
|
|
end_step $step
|
|
|