#!/bin/bash
# data_upload.sh
#
# Converts a tab-separated weather data export into Turtle files (through
# csvfix templates) and uploads them to a SPARQL update endpoint.
#
# Usage: data_upload.sh CSV_FILE
#
# All intermediate files are written to the current directory.

# Input file: first positional argument.
#CSV_FILE=${HOME}/data/Montoldre/2010_2014.txt
CSV_FILE=$1
# Directory that contains the s-update client.
JENA_HOME=${HOME}/opt/share/jena-fuseki-1.0.0
TPL_DIR=${HOME}/work/weather/src/ # Location of templates
# SPARQL update endpoint that receives the data.
SPARQL_UPDATE="http://ontology.irstea.fr:3030/weather/update"
echo "----- ( 1/10) Basic transformations"
# --- Delete the first two lines of the csv file.
# The file name is quoted so that paths containing spaces or glob characters
# work. When CSV_FILE is empty, sed reads standard input instead.
sed '1,2d' -- "${CSV_FILE:-/dev/stdin}" > tmp0.txt

# --- Replace the cardinal points of the wind direction (column 9) by angles.
# sed script mapping each of the 16 compass points to its angle in degrees.
wind_to_degrees='
s/^N$/0.0/;s/^NNE$/22.5/;s/^NE$/45.0/;s/^ENE$/67.5/
s/^E$/90.0/;s/^ESE$/112.5/;s/^SE$/135.0/;s/^SSE$/157.5/
s/^S$/180.0/;s/^SSW$/202.5/;s/^SW$/225.0/;s/^WSW$/247.5/
s/^W$/270.0/;s/^WNW$/292.5/;s/^NW$/315.0/;s/^NNW$/337.5/
'
# Columns 1-8 and 10+ pass through unchanged. The three streams are joined
# again by paste through process substitution, so no named FIFO has to be
# created in (and removed from) the working directory.
paste \
  <(cut -f1-8 tmp0.txt) \
  <(cut -f9 tmp0.txt | sed -e "$wind_to_degrees") \
  <(cut -f10- tmp0.txt) \
  > tmp1.txt
echo "----- ( 2/10) Date extraction"
# --- Dates
# Keep only the date and time columns, join them with a single space and
# rewrite the date from DD/MM/YY to 20YY/MM/DD.
cut -f1,2 tmp1.txt \
  | tr -s "\t" " " \
  | sed -e 's/^\(..\)\/\(..\)\/\(..\) /20\3\/\2\/\1 /' \
  > tmp2.txt

echo "----- ( 3/10) Date format conversion"
# In the data file itself every "/" and ":" is turned into ";", which splits
# the date and the time into separate fields.
sed -e 's/\//;/g;s/:/;/g' tmp1.txt > tmp1_bis.txt
echo "----- ( 4/10) Timestamp calculation"
# TZ must be exported: a plain assignment is only passed on to the `date`
# child processes when TZ already happened to be in the environment.
export TZ='Europe/Paris'

# Prints, for every date of file $1, its timestamp as
# YYYY-MM-DDTHH:MM:00+hh:mm (seconds forced to 00). GNU date reads all the
# dates in a single process (--file) and %:z writes the UTC offset with its
# colon, for negative offsets as well.
end_timestamps() {
  date --file="$1" '+%Y-%m-%dT%H:%M:00%:z'
}

# Prints the number of whole minutes between each date of file $1 and the
# date on the previous line. Nothing is printed for the first date.
minutes_between() {
  date --file="$1" '+%s' |
    awk 'NR > 1 { printf "%d\n", ($1 - previous) / 60 } { previous = $1 }'
}

# Timestamp of the end of each interval
end_timestamps tmp2.txt > ts2.txt

# Time interval between two consecutive measurements
{
  echo "30" # The first measurement was made with a 30 min interval
  minutes_between tmp2.txt
} > tmp3.txt
echo "----- ( 5/10) Extraction of the begining of intervals"
# TZ must be exported: a plain assignment is only passed on to the `date`
# child processes when TZ already happened to be in the environment.
export TZ='Europe/Paris'

# Prints the timestamp (YYYY-MM-DDTHH:MM:00+hh:mm) of the beginning of every
# interval, i.e. the end date minus the duration.
#   $1 - file with one duration in minutes per line
#   $2 - file with the matching end dates, one per line
# The dates are converted to epoch seconds by one GNU date process, the
# subtraction is done by awk, and a second date process formats the "@epoch"
# values, instead of spawning several processes per row.
interval_starts() {
  paste "$1" <(date --file="$2" '+%s') |
    awk -F'\t' '{ printf "@%.0f\n", $2 - $1 * 60 }' |
    date --file=- '+%Y-%m-%dT%H:%M:00%:z'
}

interval_starts tmp3.txt tmp2.txt > ts3.txt
echo "----- ( 6/10) Data cleaning"
# The computed columns (duration, end timestamp, start timestamp) are put in
# front of the original data.
paste tmp3.txt ts2.txt ts3.txt tmp1_bis.txt > tmp4.txt

# Filters the tab-separated rows read on stdin; column 1 is the duration in
# minutes. Some rows are removed:
#  - rows with a negative duration (one occurrence, probably a buffer
#    written twice);
#  - rows with a duration of 100 minutes or more: the first measurement
#    after a crash of the system (station + computer writing the data)
#    cannot be trusted to cover the whole interval. This occurred 5 times
#    from 2010 to 2013.
# Leading and trailing blanks of each row are stripped. A single awk process
# does the work, so backslashes in the data are kept and a last row without
# a final newline is not lost.
keep_plausible_rows() {
  awk -F'\t' '
    /^-/ { next }
    { sub(/^[ \t]+/, ""); sub(/[ \t]+$/, "") }
    $1 < 100
  '
}

keep_plausible_rows < tmp4.txt > tmp5.txt

#  - Rows written twice or more (probably the consequence of a wrong
#    buffering) are also removed (15 rows).
# The tabs are replaced by semicolons because csvfix did not seem to accept
# tabs as a separator.
# NOTE(review): tr -s also squeezes the resulting runs of ";", so consecutive
# empty fields collapse into one separator - confirm this is intended.
sort tmp5.txt | uniq | tr -s '\t' ';' > tmp6.txt
echo "----- ( 7/10) Timestamps creation"
# --- The timestamps have to exist before the measurement values are
#     uploaded.
#
# Collect the timestamps of the end of the intervals (column 2 of tmp4.txt),
# followed by those of the beginning of the intervals (column 3).
{
  cut -f2 tmp4.txt
  cut -f3 tmp4.txt
} > ts4.txt
# Duplicates are removed
sort ts4.txt | uniq > ts5.txt
# Each timestamp is followed by its components (year, month, day, hour, ...),
# obtained by turning every "-", "T", ":" and "+" into a ";".
sed -e 's/[-T:+]/;/g' ts5.txt | paste -d\; ts5.txt - > ts6.txt
echo "----- ( 8/10) Applying templates"
# --- Templates
#
# Every ttl file starts with its prefix declarations. The blank lines inside
# the here-documents are part of the generated files.
cat > data_header.ttl <<'EOF'

@prefix : <http://ontology.irstea.fr/weather/ontology/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ssn: <http://purl.oclc.org/NET/ssnx/ssn#> .
@prefix cf-feature: <http://purl.oclc.org/NET/ssnx/cf/cf-feature#> .
@prefix cf-property: <http://purl.oclc.org/NET/ssnx/cf/cf-property#> .
@prefix unit: <http://purl.oclc.org/NET/ssnx/qu/unit#> .
@prefix time: <http://www.w3.org/2006/time#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix DUL: <http://www.loa-cnr.it/ontologies/DUL.owl#> .

EOF
cat > timestamp_header.ttl <<'EOF'
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix time: <http://www.w3.org/2006/time#> .

EOF

# One DurationDescription per distinct duration (field 1 of tmp6.txt),
# appended after the same prefixes as the timestamp files.
cp timestamp_header.ttl durations.ttl
for minutes in $(cut -d\; -f1 tmp6.txt | sort | uniq); do
  cat <<EOF
<http://ontology.irstea.fr/weather/resource/durationDescription/P${minutes}M>
a time:DurationDescription ;
time:minutes "${minutes}"^^xsd:decimal .
EOF
done >> durations.ttl
# The work is split by month to avoid overloading the SPARQL endpoint.
# Fields 5 and 6 of tmp6.txt hold the month and the two-digit year.
for month_year in $(cut -d\; -f5,6 tmp6.txt | sort | uniq); do
  month=${month_year%%;*}
  year=${month_year#*;}
  tag="${month}-${year}"
  echo '=======>' "${month}-20${year}"

  # - Dates first: the headers, then the templated timestamps of the month
  cp timestamp_header.ttl "timestamp_${tag}.ttl"
  grep "^20${year}-${month}" ts6.txt |
    csvfix template -tf "${TPL_DIR}/timestamp.tpl.ttl" -sep ';' - \
      >> "timestamp_${tag}.ttl"
  echo ' ' "$(grep instant "timestamp_${tag}.ttl" | wc -l)" timestamps.

  # - Then data: only the rows whose end timestamp lies in this month
  sed -e '/^[0-9]\+;20'"${year}"'-'"${month}"'-/!d' tmp6.txt > "tmp_${tag}.txt"
  echo ' ' "$(wc -l < "tmp_${tag}.txt")" lines.
  cp data_header.ttl "data_${tag}.ttl" # Headers
  csvfix template -tf "${TPL_DIR}/data.tpl.ttl" -sep ';' "tmp_${tag}.txt" \
    > "tmp_${tag}.ttl"
  # Empty measurements are removed: every blank-line-separated block that
  # contains a value made only of dashes ("--", "---", ...) is dropped.
  # This happens for example for the wind direction when the wind speed
  # is 0.0.
  sed -e '/./{H;$!d;}' -e 'x;/"---*"/d;' "tmp_${tag}.ttl" >> "data_${tag}.ttl"
  rm "tmp_${tag}.txt" "tmp_${tag}.ttl" # Delete temporary files
done
rm timestamp_header.ttl data_header.ttl
echo "----- ( 9/10) Send to SPARQL endpoint"
#exit 0 # <- I prefer do the sending to the SPARQL end point manually.

# Wraps the output of rapper for Turtle file $1 in an "INSERT DATA { ... }"
# request written to file $2, then sends that request with s-update.
# Globals: JENA_HOME (read), SPARQL_UPDATE (read)
send_turtle() {
  local turtle_file=$1 request_file=$2
  {
    echo "INSERT DATA"
    echo "{"
    rapper -i turtle "$turtle_file"
    echo "}"
  } > "$request_file"
  "$JENA_HOME/s-update" --service "$SPARQL_UPDATE" --update "$request_file"
}

# --- Data sending to SPARQL endpoint
# Deletion of all data; this was needed because many tries were necessary,
# but should not have to be done anymore.
echo " Delete old data"
"$JENA_HOME/s-update" --service "$SPARQL_UPDATE" 'DROP ALL'
# Durations
echo " Durations"
send_turtle durations.ttl durations.sparql
# Sensors
echo " Sensors"
send_turtle ../sensors.ttl sensors.sparql
# Timestamps
echo " Timestamps"
for turtle in timestamp_*.ttl; do
  # An unmatched glob stays literal; do not send an empty request for it.
  [[ -e $turtle ]] || continue
  request=${turtle%ttl}sparql
  echo '--->' "$request"
  send_turtle "$turtle" "$request"
done
# Then data
echo " Data"
for turtle in data_*.ttl; do
  [[ -e $turtle ]] || continue
  request=${turtle%ttl}sparql
  echo '--->' "$request"
  send_turtle "$turtle" "$request"
done
# --- Cleaning up
echo "----- (10/10) Cleaning up"
# Intermediate files of the run. A pattern without any match is handed to rm
# unchanged, and -f makes rm ignore it.
leftovers=(data_??-??.ttl tmp*.t?? ts?.txt *.sparql)
rm -fv "${leftovers[@]}"
# End of the pipeline. A verbatim second copy of steps 1-10 must not follow
# here: it would run the whole conversion again, including the 'DROP ALL'
# and every upload to the SPARQL endpoint.