MEL install and database mapping

From Hiscom
Jump to: navigation, search


This page describes one approach to the process of installing a BioCase provider and mirroring a texpress database to use as the data provider. In summary, a single mysql table is used to represent the texpress data, while a number of abcd tables are created. Then there is a big SQL script that runs from a cron script that does the updating from the single table to the abcd tables.

MySQL mirror

The Texpress database is mirrored in the following single table mysql database:

   CREATE TABLE specimens (
       additcol VARCHAR(90) NULL,	# additional collectors
       altm INT NULL,			# altitude in metres (ft not needed)
       authorit VARCHAR(80) NULL,	# the species authority
       basio2 VARCHAR(66) NULL,	# basionym 2
       basio3 VARCHAR(66) NULL,	# basionym 3
       basionym VARCHAR(66) NULL,	# oldest basionym
       bru VARCHAR(30) NULL,		# TDWG level 4
       carpolco CHAR(2) NULL,		# carpological collection
       ciba CHAR(2) NULL, 		# ciobachrome
       class CHAR(2) NULL,		# Y if a dry sheet
       collcode CHAR(3) NULL,		# internal use for unknown collectors
       colldat_1 INT NULL,		# collection date day
       colldat_2 CHAR(3) NULL,		# colleciton month
       colldat_3 INT NULL,		# collection year
       collectr VARCHAR(65) NULL,	# collector
       colourtr CHAR(2) NULL,		# colour transparency
       confdat_1 INT NULL,		# day of confirmation
       confdat_2 CHAR(3) NULL,		# month of confirmation
       confdat_3 INT NULL,		# year of confirmation
       confirme VARCHAR(30) NULL,	# who confirmed specimen
       country VARCHAR(30) NULL,	# TDWG level 2
       cultivat CHAR(2) NULL,		# Cultivated specimen
       datecrea DATE NULL,		# date record created
       dateedit DATE NULL,		# date record last edited
       declat DOUBLE NULL,		# decimal lat created on fly
       declong DOUBLE NULL,		# decimal long created on fly
       depth INT NULL,
       detby VARCHAR(30) NULL,		# determiner
       detdat_1 INT NULL,		# determined on day DD
       detdat_2 CHAR(3) NULL,		# determined on month MMM
       detdat_3 INT NULL,		# determined on YEAR YYYY
       duplicat VARCHAR(135) NULL,	# destination of replicates
       duptypst VARCHAR(15) NULL,	# type status of dupliates
       exsiccat VARCHAR(25) NULL,	# who donated specimen
       extra CHAR(2) NULL,		# there is extra info on the specimen
       family VARCHAR(35) NULL,		
       genus VARCHAR(25) NULL,
       geonum TINYINT NULL,		# TDWG Level 1 number
       georeg VARCHAR(20) NULL,	# TDWG level 1
       gridrefe_1 CHAR(1) NULL,	# major grid
       gridrefe_2 TINYINT NULL,	# minor grid
       gridrefe_3 CHAR(1) NULL,	# major grid 2
       gridrefe_4 TINYINT NULL,	# minor grid 2
       habit TEXT NULL,
       habitat TEXT NULL,
       hcommonn VARCHAR(60) NULL,	# host common name
       HERBKEY_1 VARCHAR(10) NOT NULL,	# PRIMARY KEY
       HERBKEY_2 INT NOT NULL,		# PRIMARY KEY
       HERBKEY_3 CHAR(1) NOT NULL,	# PRIMARY KEY
       hgenus VARCHAR(30) NULL,	# host genus
       hinfrasp VARCHAR(35) NULL,	# host infraspecific name
       hortref CHAR(2) NULL,		# part of hort ref collection
       hostfami VARCHAR(35) NULL,	# host family
       hspecies VARCHAR(40) NULL,	# host species 
       illustra CHAR(2) NULL,		# an illustration with specimen
       infrank VARCHAR(5) NULL,	# infraspecific rank
       infraspn VARCHAR(50) NULL,	# infraspecific name
       infraspq VARCHAR(10) NULL,	# infraspecific qualifier
       infrauth VARCHAR(58) NULL,	# infraspecific authority
       intro CHAR(2) NULL,		# if species is introduced
       lat1_1 SMALLINT NULL,		# lat degrees
       lat1_2 TINYINT NULL,		# lat minutes
       lat1_3 TINYINT NULL,		# lat seconds
       lat1_4 CHAR(1) NULL,		# lat dir
       letter CHAR(2) NULL, 		# the jar size
       lichenla CHAR(2) NULL,		# if barcodes have been printed 
       locality TEXT NULL,
       long1_1 SMALLINT NULL,		# long degrees
       long1_2 TINYINT NULL,		# long minutes
       long1_3 TINYINT NULL,		# long seconds
       long1_4 CHAR(1) NULL,		# long direction
       melfamil SMALLINT NULL,		# the internal numbering of families
       mixednot VARCHAR(150) NULL,	# notes to distinguish parts of a mixed coll.
       multiple VARCHAR(150) NULL,	# notes on how this might relate to another rec
       native CHAR(2) NULL,		# is a native species
       neg CHAR(2) NULL,		# a photographic negative with specimen
       newloan_1 SMALLINT NULL,	# the year component of the loan number
       newloan_2 SMALLINT NULL,	# the number component of the loan number
       notes TEXT NULL,
       number VARCHAR(12) NULL,	# the collectors field number
       origherb CHAR(5) NULL, 		# which herb has the specimen for this image
       phenolog_1 CHAR(2) NULL, 	# FL if flowers
       phenolog_2 CHAR(2) NULL,        # FR if fruits
       phenolog_3 CHAR(2) NULL,        # BD if buds
       phenolog_4 CHAR(2) NULL,        # LL if leafless
       phenolog_5 CHAR(2) NULL,        # FE if fertile
       phenolog_6 CHAR(2) NULL,        # ST if sterile
       photo CHAR(2) NULL,		# a photo with specimen
       precisio TINYINT NULL,		# the precision of the geocode
       priornam VARCHAR(135) NULL,	# equiv to name comments
       prot3 VARCHAR(64) NULL,		# protologue 3
       proto2 VARCHAR(64) NULL,	# protologue 2
       protolog VARCHAR(100) NULL,	# the oldest protologue
       seedling VARCHAR(12) NULL,	# a cross refereced seedling number
       slide CHAR(2) NULL,		# if there is a slide
       slideno VARCHAR(10) NULL,	# the slide number
       source TINYINT NULL,		# source of collection
       species VARCHAR(50) NULL,
       spirit CHAR(2) NULL,		# if there is a spirit collection
       spiritco VARCHAR(10) NULL,	# the spirit number
       spqual VARCHAR(10) NULL,	# species qualifier
       state VARCHAR(28) NULL,		# TDWG level 3
       substrat VARCHAR(75) NULL,	# substrate
       supragrp CHAR(2) NULL,		# supra group F,D,M,A,B,H etc
       synonym VARCHAR(67) NULL,	# no longer used
       transferred DATETIME NULL,	# date transferred 
       typedby VARCHAR(8) NULL,	# who created record
       typeof VARCHAR(67) NULL,	# no longer used
       typest3 VARCHAR(38) NULL,	# type status 3
       typesta2 VARCHAR(38) NULL,	# type status 2
       typestat VARCHAR(20) NULL,	# oldest type status
       typey CHAR(2) NULL,		# if this is a type
       userid VARCHAR(8) NULL,		# who last edited record
       voucher CHAR(2) NULL,		# is a voucher for something
       INDEX (genus, species, infraspn),
       INDEX (family),
       INDEX (newloan_1, newloan_2),
       INDEX (supragrp),
       INDEX (detby),
       INDEX (confirme),
       INDEX (collectr, number),
       INDEX (colldat_3, colldat_2,colldat_1),
       INDEX (georeg),
       INDEX (country),
       INDEX (state),
       INDEX (bru),
       INDEX (declat, declong),
       INDEX (locality(200)),
       INDEX (habitat(200)),
       INDEX (habit(200)),
       INDEX (notes(200)),
       INDEX (phenolog_1),
       INDEX (phenolog_2),
       INDEX (phenolog_3),
       INDEX (phenolog_4),
       INDEX (phenolog_5),
       INDEX (phenolog_6),
       INDEX (typedby),
       INDEX (userid),
       INDEX (datecrea),
       INDEX (dateedit),
       PRIMARY KEY (HERBKEY_1, HERBKEY_2, HERBKEY_3)
   );

ABCD tables

The following tables were created to better reflect the structure of ABCD

abcd_area

CREATE TABLE abcd_area (
  area_name varchar(150) default NULL,
  area_class varchar(30) default NULL,
  unitID varchar(10) default NULL,
  areaID int(11) NOT NULL auto_increment,
  PRIMARY KEY  (areaID),
  KEY unitID (unitID)
)

abcd_biotope

CREATE TABLE abcd_biotope (
  unitID varchar(10) default NULL,
  parameter varchar(30) default NULL,
  value text,
  is_quantitative tinyint(4) default NULL,
  biotopeID int(11) NOT NULL auto_increment,
  PRIMARY KEY  (biotopeID),
  KEY unitID (unitID)
)

abcd_collector

CREATE TABLE abcd_collector (
  collector_name varchar(255) default NULL,
  sequence int(11) default NULL,
  primary_flag int(11) default NULL,
  unitID varchar(10) default NULL,
  collectorID int(11) NOT NULL auto_increment,
  PRIMARY KEY  (collectorID),
  KEY unitID (unitID)
)

abcd_higher_taxon

CREATE TABLE abcd_higher_taxon (
  htID int(11) NOT NULL auto_increment,
  name varchar(80) default NULL,
  rank varchar(20) default NULL,
  unitID varchar(10) default NULL,
  PRIMARY KEY  (htID),
  KEY unitID (unitID)
)

abcd_identification

CREATE TABLE abcd_identification (
  unitID varchar(10) default NULL,
  higher_taxon_name varchar(30) default NULL,
  higher_taxon_rank varchar(20) default NULL,
  scientific_name varchar(100) default NULL,
  genus varchar(30) default NULL,
  name_addendum varchar(50) default NULL,
  species varchar(30) default NULL,
  author_team varchar(30) default NULL,
  author_team_parenthesis varchar(30) default NULL,
  rank varchar(10) default NULL,
  infraspecies varchar(30) default NULL,
  verification_level varchar(20) default NULL,
  identifiers_text varchar(40) default NULL,
  identification_date date default NULL,
  idenfifiers_role varchar(30) default NULL,
  identification_qualifier varchar(20) default NULL,
  identification_qualifier_insert int(11) default NULL,
  name_comments varchar(200) default NULL,
  KEY unitID (unitID)
)

abcd_metadata

CREATE TABLE abcd_metadata (
  metaID int(11) NOT NULL default '0',
  dataset_title varchar(100) default NULL,
  technical_contact_name varchar(100) default NULL,
  date_modified date default NULL,
  owner varchar(100) default NULL,
  source_id varchar(10) default NULL,
  PRIMARY KEY  (metaID),
  KEY source_id (source_id)
)

abcd_typification

CREATE TABLE abcd_typification (
  unitID varchar(10) default NULL,
  type_status varchar(20) default NULL,
  basionym varchar(100) default NULL,
  KEY unitID (unitID)
)

abcd_unit

CREATE TABLE abcd_unit (
  unitID varchar(10) NOT NULL default ,
  herbarium_source varchar(6) NOT NULL default ,
  scientific_name varchar(255) default NULL,
  collector_number varchar(15) default NULL,
  collection_date date default NULL,
  generalised_locality varchar(30) default NULL,
  declat_fudged float default NULL,
  declong_fudged float default NULL,
  geocode_source enum('collector','compiler','generalised arbitrary point','automatically generated') default NULL,
  geocode_precision_fudged int(11) default NULL,
  genus varchar(30) default NULL,
  species varchar(30) default NULL,
  rank varchar(15) default NULL,
  infraspecies varchar(30) default NULL,
  date_last_edited date default NULL,
  family varchar(30) default NULL,
  declat double default NULL,
  declong double default NULL,
  geocode_precision int(11) default NULL,
  locality text,
  alt int(11) default NULL,
  depth int(11) default NULL,
  coordinates_text varchar(100) default NULL,
  spatial_datum varchar(10) default NULL,
  notes text,
  country varchar(40) default NULL,
  duplicates varchar(153) default NULL,
  donor varchar(25) default NULL,
  cultivated char(2) default NULL,
  PRIMARY KEY  (unitID),
  KEY scientific_name_index (scientific_name),
  KEY latlong (declat_fudged,declong_fudged)
)

The conversion script

This SQL script gets run to convert the data in the specimen table into the various abcd tables (note, this is still a work in progress, so not all data elements are being created just yet).

   -- in this script, the data will be set by using sed to change
   -- the <date> value
   SET @datefrom = <date>;

   -- load the unit table
   REPLACE LOW_PRIORITY INTO abcd_unit (
   unitID,
   herbarium_source,
   scientific_name,
   genus,
   species,
   rank,
   infraspecies,
   family,
   collector_number,
   collection_date,
   generalised_locality,
   declat_fudged,
   declong_fudged,
   geocode_precision_fudged,
   declat,
   declong,
   geocode_precision,
   geocode_source,
   date_last_edited,
   locality,
   alt,
   depth,
   coordinates_text,
   spatial_datum,
   notes,
   country,
   duplicates,
   donor,
   cultivated)


   SELECT 

   -- unitID
   CONCAT(HERBKEY_2,HERBKEY_3),

   -- herbarium_source
   "MEL",

   -- scientific_name
   CONCAT_WS(" ", genus, species, infrank, infraspn),

   -- genus
   genus,

   -- species
   species,

   -- rank
   CASE infrank WHEN "ssp" 	THEN "subsp."
            WHEN "ssp." 	THEN "subsp."
            WHEN "subsp"	THEN "subsp."
            WHEN "f" 		THEN "f."
            WHEN "forma" 	THEN "f."
            WHEN "var" 	THEN "var."
            ELSE NULL
   END
   ,

   -- infraspecies
   infraspn,

   --family
   family,

   -- collectors_number
   number,

   -- collection_date
   CONCAT_WS("-",
       colldat_3,
       CASE colldat_2  WHEN "Jan" THEN 01
                       WHEN "Feb" THEN 02
                       WHEN "Mar" THEN 03
                       WHEN "Apr" THEN 04
                       WHEN "May" THEN 05
                       WHEN "Jun" THEN 06
                       WHEN "Jul" THEN 07
                       WHEN "Aug" THEN 08
                       WHEN "Sep" THEN 09
                       WHEN "Oct" THEN 10
                       WHEN "Nov" THEN 11
                       WHEN "Dec" THEN 12
                       ELSE NULL
            END,
       colldat_1),

   -- generalised_locality	
   "Nearest locality not available",

   -- declat_fudged
   CASE lat1_4 WHEN "S" THEN -1.00 WHEN "N" THEN 1.00 END * 
   (lat1_1 * 1.00)
   + ((FLOOR(lat1_2/10.0) * 10.0)/60.00),

   -- declong_fudged
   CASE long1_4 WHEN "W" THEN -1.00 WHEN "E" THEN 1.00 END * 
   (long1_1 * 1.00)
   + ((FLOOR(long1_2/10.0) * 10.0)/60.00),

   -- geocode_precision_fudged
   -- (note adding in extra for fudging of geocode)
   CASE precisio WHEN 1 THEN 50
                 WHEN 2 THEN 1000
                 WHEN 3 THEN 10000
                 WHEN 4 THEN 250000
                 WHEN 5 THEN 110000
                 ELSE NULL
                 END
   + (18500),
   CASE lat1_4 WHEN "S" THEN -1.00 
               WHEN "N" THEN  1.00 END
   * (lat1_1 * 1.000)
   + (lat1_2/60.00)
   + (lat1_3/3600.0),

   -- declong
   CASE long1_4 WHEN "W" THEN -1.00 WHEN "E" THEN 1.00 END 
   * (long1_1 * 1.000)
   + (long1_2/60.00)
   + (long1_3/3600.0),

   -- geocode_precision
   CASE precisio WHEN 1 THEN 50
                 WHEN 2 THEN 1000
                 WHEN 3 THEN 10000
                 WHEN 4 THEN 250000
                 WHEN 5 THEN 110000
                 ELSE NULL END ,

   -- geocode_source
   CASE source WHEN 1 THEN "collector"
           WHEN 2 THEN "compiler"
           WHEN 3 THEN "compiler"
           WHEN 4 THEN "collector"
           ELSE NULL
           END,

   -- date_last_edited
   dateedit,

   -- locality
   locality,

   -- alt
   altm,

   -- depth
   depth,

   -- coordinates_text
   IF(lat1_1 OR long1_1,CONCAT_WS(",",lat1_1, lat1_2, lat1_3, lat1_4, long1_1, long1_2, long1_3, long1_4),NULL),

   -- spatial_datum
   "gda94",

   -- notes
   notes,

   -- country
   country,

   -- duplicates
   duplicat,

   -- donor
   exsiccat,

   -- cultivated
   IF(cultivat = "Y", "G", "W")


   FROM specimens WHERE dateedit >= @datefrom;


   -- now load the abcd_collector table
   -- but first delete records that relate to the data we are loading

   DELETE LOW_PRIORITY from abcd_collector USING abcd_collector, abcd_unit
   WHERE abcd_unit.unitID = abcd_collector.unitID AND abcd_unit.date_last_edited >= @datefrom;

   REPLACE LOW_PRIORITY INTO abcd_collector (
   collector_name,
   sequence,
   primary_flag,
   unitID)
   SELECT 
   collectr, 	-- collector_name
   1,		-- sequence
   1,		-- primary collector flag
   CONCAT(HERBKEY_2,HERBKEY_3)-- unitID
   FROM specimens WHERE dateedit >= @datefrom;



   -- delete old abcd_area data
   DELETE LOW_PRIORITY FROM abcd_area USING abcd_area, abcd_unit
   WHERE abcd_unit.unitID = abcd_area.unitID AND abcd_unit.date_last_edited >= @datefrom;

   -- load in state
   REPLACE LOW_PRIORITY INTO abcd_area (
   area_name,
   area_class,
   unitID)
   SELECT 
   bru,
   "State",
   CONCAT(HERBKEY_2,HERBKEY_3)
   FROM specimens 
   WHERE bru IS NOT NULL AND dateedit >= @datefrom;

   -- load in country
   REPLACE LOW_PRIORITY INTO abcd_area (
   area_name,
   area_class,
   unitID)
   SELECT 
   country,
   "Country",
   CONCAT(HERBKEY_2,HERBKEY_3)
   FROM specimens 
   WHERE bru IS NOT NULL AND dateedit >= @datefrom;

   -- delete LOW_PRIORITY from biotope
   DELETE FROM abcd_biotope USING abcd_biotope, abcd_unit
   WHERE abcd_unit.unitID = abcd_biotope.unitID AND abcd_unit.date_last_edited >= @datefrom;



   -- load into biotope
   INSERT LOW_PRIORITY INTO abcd_biotope (unitID,parameter,value,is_quantitative)
   SELECT CONCAT(HERBKEY_2,HERBKEY_3),"habitat",habitat,0
   FROM specimens where habitat IS NOT NULL AND dateedit >= @datefrom;

   INSERT LOW_PRIORITY INTO abcd_biotope (unitID,parameter,value,is_quantitative)
   SELECT CONCAT(HERBKEY_2,HERBKEY_3),"form",habit,0
   FROM specimens where habit IS NOT NULL AND dateedit >= @datefrom;


   -- load in district (botanical region / district)
   REPLACE LOW_PRIORITY INTO abcd_area (
   area_name,
   area_class,
   unitID)
   SELECT 
   regiondi,
   "sru",
   CONCAT(HERBKEY_2,HERBKEY_3)
   FROM specimens WHERE regiondi IS NOT NULL AND dateedit >= @datefrom;
 
   -- load into higher_taxon
   DELETE LOW_PRIORITY from abcd_higher_taxon USING abcd_higher_taxon, abcd_unit
   WHERE abcd_unit.unitID = abcd_higher_taxon.unitID AND abcd_unit.date_last_edited >= @datefrom;
 
   REPLACE LOW_PRIORITY INTO abcd_higher_taxon(
   name, rank, unitID)
   SELECT 
   family, "family", CONCAT(HERBKEY_2,HERBKEY_3)
   FROM specimens WHERE family IS NOT NULL AND dateedit >= @datefrom;
 
   -- update the date last updated in the metadata
   update abcd_metadata SET date_modified = NOW() WHERE source_id = "MEL";