# data_upload.sh  (wiki page title — not part of the script; shebang follows)

#!/bin/bash
# Upload Montoldre weather-station CSV data as RDF to a Fuseki SPARQL endpoint.
# Usage: data_upload.sh <csv_file>
# Requires: GNU coreutils/sed, csvfix, rapper (raptor), Jena Fuseki s-update.

#CSV_FILE=${HOME}/data/Montoldre/2010_2014.txt
CSV_FILE=${1:-}                        # tab-separated input CSV, first argument
readonly JENA_HOME=${HOME}/opt/share/jena-fuseki-1.0.0
readonly TPL_DIR=${HOME}/work/weather/src/     # Location of templates
readonly SPARQL_UPDATE="http://ontology.irstea.fr:3030/weather/update"

echo "----- ( 1/10) Basic transformations"

# --- Delete the first two (header) lines of the csv file.
#     sed reads the file directly (no useless cat), input path quoted.
sed '1,2d' -- "$CSV_FILE" > tmp0.txt

# --- Replace cardinal points of wind directions with angles (degrees).
#     The file is split through named pipes into columns 1-8 / 9 / 10-,
#     column 9 (the wind direction) is rewritten by sed, and the three
#     streams are pasted back together.
#     NOTE: the previous '"""' / '" ""' quoting (a wiki-escaping artifact)
#     passed an extra empty operand to sed, which sed tried to open as an
#     input file; a single-quoted script fixes that.
mkfifo wnd_{1,2,3}
cut -f1-8 tmp0.txt > wnd_1 &
cut -f10- tmp0.txt > wnd_3 &
cut -f9 tmp0.txt | sed -e '
s/^N$/0.0/;s/^NNE$/22.5/;s/^NE$/45.0/;s/^ENE$/67.5/
s/^E$/90.0/;s/^ESE$/112.5/;s/^SE$/135.0/;s/^SSE$/157.5/
s/^S$/180.0/;s/^SSW$/202.5/;s/^SW$/225.0/;s/^WSW$/247.5/
s/^W$/270.0/;s/^WNW$/292.5/;s/^NW$/315.0/;s/^NNW$/337.5/
' > wnd_2 &
paste wnd_{1,2,3} > tmp1.txt
wait    # make sure the three background writers are finished before cleanup
rm wnd_{1,2,3}

echo "----- ( 2/10) Date extraction"

# --- Dates
#     Extract the date and time columns and convert dd/mm/yy to 20yy/mm/dd
#     (the tab between the two columns becomes a single space).
cut -f1,2 tmp1.txt | tr -s "\t" " " |
   sed -e 's/^\(..\)\/\(..\)\/\(..\) /20\3\/\2\/\1 /' > tmp2.txt

echo "----- ( 3/10) Date format conversion"

#     The date format also has to be changed in the csv file: '/' and ':'
#     separators become ';' so every field is semicolon-delimited later on.
sed -e 's/\//;/g;s/:/;/g' tmp1.txt > tmp1_bis.txt

echo "----- ( 4/10) Timestamp calculation"

#     Timestamp calculation: one ISO-8601 timestamp per input date.
#     GNU date -f reads every date in a single pass instead of forking one
#     date process per line.  TZ is passed in the command environment so
#     date is guaranteed to see it (a plain shell assignment is not
#     necessarily exported).  The sed turns the numeric zone '+0200' into
#     '+02:00' as required by xsd:dateTime.
TZ='Europe/Paris' date -f tmp2.txt '+%Y-%m-%dT%H:%M:00%z' |
  sed -e 's/\+\([0-9]\{2\}\)/+\1:/' > ts2.txt

#     Calculation of time intervals between two consecutive measurements,
#     in minutes (difference of the epoch timestamps of adjacent lines).
echo "30" > tmp3.txt # The first measure was done with a 30 min interval
unset d1
while read -r d2; do
  if [[ -v d1 ]]; then
    e2=$(date --date="$d2" +%s)
    e1=$(date --date="$d1" +%s)
    echo $(( (e2 - e1) / 60 ))
  fi
  d1="$d2"
done < tmp2.txt >> tmp3.txt

echo "----- ( 5/10) Extraction of the beginning of intervals"

#     The timestamp of the beginning of each interval is computed by
#     subtracting the interval length (minutes, tmp3.txt) from the
#     end-of-interval date (tmp2.txt).  Reading the tab-separated fields
#     directly with 'read' avoids two echo|cut forks per line; TZ is passed
#     in each date command's environment so it is definitely seen.
paste tmp{3,2}.txt | while IFS=$'\t' read -r mm dd hh; do
  ee=$(( $(TZ='Europe/Paris' date --date "$dd $hh" +%s) - mm * 60 ))
  TZ='Europe/Paris' date --date=@"$ee" '+%Y-%m-%dT%H:%M:00%z' |
    sed -e 's/\+\([0-9]\{2\}\)/+\1:/'
done > ts3.txt

echo "----- ( 6/10) Data cleaning"

#     The computed columns are glued back onto the original csv file:
#     interval length, end timestamp, begin timestamp, then the data.
paste tmp3.txt ts2.txt ts3.txt tmp1_bis.txt > tmp4.txt

#     Some lines are deleted from the original file :
#     - One line has a negative duration (probably a buffer written twice)
#     - The first measure after a crash of the system (station + computer
#       writing data) is deleted because we cannot know if the measures
#       consider the whole time interval or not (and certainly not).
#       This occured 5 times from 2010 to 2013
#     A single awk pass replaces the former grep + per-line shell loop:
#     keep rows whose first field (interval in minutes) is not negative
#     and is below 100.
awk -F'\t' '!/^-/ && $1 < 100' tmp4.txt > tmp5.txt

#     - Lines written twice (or more - probably a consequence of wrong
#       buffering) are also deleted (15 lines are deleted).
#       The tabs are replaced with semicolons (csvfix does not seem to
#       accept tabs as a separator).
#     NOTE(review): 'tr -s' squeezes runs of tabs, so adjacent empty
#     fields collapse into one — kept as-is, confirm this is intended.
sort -u tmp5.txt | tr -s '\t' ';' > tmp6.txt

echo "----- ( 7/10) Timestamps creation"

# --- Before we upload measurement values, we have to create timestamps.
#
#     So, we get the timestamps of the end of intervals
cut -f2 tmp4.txt > ts4.txt
#     and also the beginning of intervals.
cut -f3 tmp4.txt >> ts4.txt
#     Duplicates are deleted
sort -u ts4.txt > ts5.txt
#     and year, month, day, hours, ... are extracted from the timestamps
#     (every separator becomes ';', pasted next to the full timestamp).
sed -e 's/[-T:+]/;/g' ts5.txt | paste -d\; ts5.txt - > ts6.txt

echo "----- ( 8/10) Applying templates"

# --- Templates
#
#     Header (prefix declarations) for the generated data ttl files.
#     A quoted here-doc replaces the former wiki-mangled '"""' echo, which
#     emitted a stray blank line and a trailing space.
cat > data_header.ttl <<'EOF'
@prefix : <http://ontology.irstea.fr/weather/ontology/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ssn: <http://purl.oclc.org/NET/ssnx/ssn#> .
@prefix cf-feature: <http://purl.oclc.org/NET/ssnx/cf/cf-feature#> .
@prefix cf-property: <http://purl.oclc.org/NET/ssnx/cf/cf-property#> .
@prefix unit: <http://purl.oclc.org/NET/ssnx/qu/unit#> .
@prefix time: <http://www.w3.org/2006/time#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix DUL: <http://www.loa-cnr.it/ontologies/DUL.owl#> .
EOF

#     Header (prefix declarations) for the generated timestamp ttl files,
#     written with a quoted here-doc (the former '"""' echo quoting was a
#     wiki-escaping artifact).
cat > timestamp_header.ttl <<'EOF'
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix time: <http://www.w3.org/2006/time#> .
EOF

#     One time:DurationDescription resource per distinct interval length
#     (first ;-field of tmp6.txt, in minutes).  'sort -u' replaces the
#     former 'uniq | sort | uniq'; the here-doc replaces the mangled echo.
cp timestamp_header.ttl durations.ttl
for i in $(cut -d\; -f1 tmp6.txt | sort -u); do
  cat >> durations.ttl <<EOF
<http://ontology.irstea.fr/weather/resource/durationDescription/P${i}M>
   a time:DurationDescription ;
   time:minutes "${i}"^^xsd:decimal .
EOF
done

#     The processing is split by month to avoid overloading the sparql
#     endpoint.  tmp6.txt fields 5 and 6 are the month and (2-digit) year.
for i in $(cut -d\; -f5,6 tmp6.txt | sort -u); do
  mm=$(echo "$i" | cut -d\; -f1); aa=$(echo "$i" | cut -d\; -f2); j="${mm}-${aa}"
  echo '=======>' "$mm-20$aa"

  # - Dates first
  cp timestamp_header.ttl "timestamp_${j}.ttl" # We start with just the headers
  grep "^20${aa}-${mm}" ts6.txt |
      csvfix template -tf "${TPL_DIR}/timestamp.tpl.ttl" -sep ';' - >> "timestamp_${j}.ttl"
  echo '        ' "$(grep -c instant "timestamp_${j}.ttl")" timestamps.

  # - Then data: keep only the rows of this month
  sed -e '/^[0-9]\+;20'"${aa}"'-'"${mm}"'-/!d' tmp6.txt > "tmp_${j}.txt"
  echo '        ' "$(wc -l < "tmp_${j}.txt")" lines.
  cp data_header.ttl "data_${j}.ttl" # Headers
  csvfix template -tf "${TPL_DIR}/data.tpl.ttl" -sep ';' "tmp_${j}.txt" > "tmp_${j}.ttl"

  #   Deletion of empty measures (values are replaced by "---").
  #   This happens for example for wind direction when wind speed is 0.0.
  #   (hold-space trick: accumulate each blank-line-separated record, then
  #   drop the whole record if it contains a "---" value)
  sed -e '/./{H;$!d;}' -e 'x;/"---*"/d;' "tmp_${j}.ttl" >> "data_${j}.ttl"

  rm "tmp_${j}.txt" "tmp_${j}.ttl" # Delete temporary files
done
rm {timestamp,data}_header.ttl


echo "----- ( 9/10) Send to SPARQL endpoint"

#exit 0 # <- I prefer to do the sending to the SPARQL end point manually.

# --- Data sending to SPARQL endpoint
#     Deletion of all existing data; this had to be done because many tries
#     were needed, but it does not have to be done anymore.
echo "      Delete old data"
"$JENA_HOME"/s-update --service "$SPARQL_UPDATE" 'DROP ALL'

#     Wrap a Turtle file in an INSERT DATA request and send it to the
#     SPARQL endpoint.  rapper converts the Turtle to raw triples so the
#     request is self-contained.  The generated .sparql file is kept on
#     disk for inspection (as the original four copy-pasted snippets did).
#       $1 - source .ttl file
#       $2 - .sparql file to generate and send
send_ttl() {
  local ttl=$1 sparql=$2
  {
    echo "INSERT DATA"
    echo "{"
    rapper -i turtle "$ttl"
    echo "}"
  } > "$sparql"
  "$JENA_HOME"/s-update --service "$SPARQL_UPDATE" --update "$sparql"
}

#     Durations
echo "      Durations"
send_ttl durations.ttl durations.sparql

#     Sensors
echo "      Sensors"
send_ttl ../sensors.ttl sensors.sparql

#     Timestamps
echo "      Timestamps"
for i in timestamp_*.ttl; do
  j=${i%ttl}sparql          # replace the .ttl suffix with .sparql
  echo '--->' "$j"
  send_ttl "$i" "$j"
done

#     Then data
echo "      Data"
for i in data_*.ttl; do
  j=${i%ttl}sparql
  echo '--->' "$j"
  send_ttl "$i" "$j"
done

# --- Cleaning up: remove every intermediate file.
#     timestamp_??-??.ttl added: the per-month timestamp files were created
#     alongside data_??-??.ttl but were previously left behind.
echo "----- (10/10) Cleaning up"
rm -fv -- {data,timestamp}_??-??.ttl tmp*.t?? ts?.txt *.sparql

# --- Wiki page footer captured when this script was copied from the wiki
#     (not part of the script; kept here commented out so the file runs):
#     Weather data / CASO / BSV / ppdo / AgronomicTaxon / AgroTechnoPôle /
#     GIEEA / edit SideBar /
#     Blix theme adapted by David Gilbert, powered by PmWiki