MEL install and database mapping

This page describes one approach to the process of installing a BioCase provider and mirroring a texpress database to use as the data provider. In summary, a single mysql table is used to represent the texpress data, while a number of abcd tables are created. Then there is a big SQL script that runs from a cron script that does the updating from the single table to the abcd tables.

MySQL mirror
The Texpress database is mirrored in the following single table mysql database:

CREATE TABLE specimens (       additcol VARCHAR(90) NULL,	# additional collectors        altm INT NULL,			# altitude in metres (ft not needed)        authorit VARCHAR(80) NULL,	# the species authority        basio2 VARCHAR(66) NULL,	# basionym 2        basio3 VARCHAR(66) NULL,	# basionym 3        basionym VARCHAR(66) NULL,	# oldest basionym        bru VARCHAR(30) NULL,		# TDWG level 4        carpolco CHAR(2) NULL,		# carpological collection        ciba CHAR(2) NULL, 		# ciobachrome        class CHAR(2) NULL,		# Y if a dry sheet        collcode CHAR(3) NULL,		# internal use for unknown collectors        colldat_1 INT NULL,		# collection date day        colldat_2 CHAR(3) NULL,		# colleciton month        colldat_3 INT NULL,		# collection year        collectr VARCHAR(65) NULL,	# collector        colourtr CHAR(2) NULL,		# colour transparency        confdat_1 INT NULL,		# day of confirmation confdat_2 CHAR(3) NULL,		# month of confirmation confdat_3 INT NULL,		# year of confirmation confirme VARCHAR(30) NULL,	# who confirmed specimen country VARCHAR(30) NULL,	# TDWG level 2 cultivat CHAR(2) NULL,		# Cultivated specimen datecrea DATE NULL,		# date record created dateedit DATE NULL,		# date record last edited declat DOUBLE NULL,		# decimal lat created on fly declong DOUBLE NULL,		# decimal long created on fly depth INT NULL, detby VARCHAR(30) NULL,		# determiner detdat_1 INT NULL,		# determined on day DD       detdat_2 CHAR(3) NULL,		# determined on month MMM detdat_3 INT NULL,		# determined on YEAR YYYY duplicat VARCHAR(135) NULL,	# destination of replicates duptypst VARCHAR(15) NULL,	# type status of dupliates exsiccat VARCHAR(25) NULL,	# who donated specimen extra CHAR(2) NULL,		# there is extra info on the specimen family VARCHAR(35) NULL, genus VARCHAR(25) NULL, geonum TINYINT NULL,		# TDWG Level 1 number georeg VARCHAR(20) NULL,	# TDWG level 1 gridrefe_1 CHAR(1) NULL,	# major grid gridrefe_2 TINYINT NULL,	# minor grid gridrefe_3 CHAR(1) NULL,	# major grid 2 gridrefe_4 TINYINT NULL,	# minor grid 2 habit TEXT NULL, habitat TEXT NULL, hcommonn VARCHAR(60) NULL,	# host common name HERBKEY_1 VARCHAR(10) NOT NULL,	# PRIMARY KEY HERBKEY_2 INT NOT NULL,		# PRIMARY KEY HERBKEY_3 CHAR(1) NOT NULL,	# PRIMARY KEY hgenus VARCHAR(30) NULL,	# host genus hinfrasp VARCHAR(35) NULL,	# host infraspecific name hortref CHAR(2) NULL,		# part of hort ref collection hostfami VARCHAR(35) NULL,	# host family hspecies VARCHAR(40) NULL,	# host species illustra CHAR(2) NULL,		# an illustration with specimen infrank VARCHAR(5) NULL,	# infraspecific rank infraspn VARCHAR(50) NULL,	# infraspecific name infraspq VARCHAR(10) NULL,	# infraspecific qualifier infrauth VARCHAR(58) NULL,	# infraspecific authority intro CHAR(2) NULL,		# if species is introduced lat1_1 SMALLINT NULL,		# lat degrees lat1_2 TINYINT NULL,		# lat minutes lat1_3 TINYINT NULL,		# lat seconds lat1_4 CHAR(1) NULL,		# lat dir letter CHAR(2) NULL, 		# the jar size lichenla CHAR(2) NULL,		# if barcodes have been printed locality TEXT NULL, long1_1 SMALLINT NULL,		# long degrees long1_2 TINYINT NULL,		# long minutes long1_3 TINYINT NULL,		# long seconds long1_4 CHAR(1) NULL,		# long direction melfamil SMALLINT NULL,		# the internal numbering of families mixednot VARCHAR(150) NULL,	# notes to distinguish parts of a mixed coll. multiple VARCHAR(150) NULL,	# notes on how this might relate to another rec native CHAR(2) NULL,		# is a native species neg CHAR(2) NULL,		# a photographic negative with specimen newloan_1 SMALLINT NULL,	# the year component of the loan number newloan_2 SMALLINT NULL,	# the number component of the loan number notes TEXT NULL, number VARCHAR(12) NULL,	# the collectors field number origherb CHAR(5) NULL, 		# which herb has the specimen for this image phenolog_1 CHAR(2) NULL, 	# FL if flowers phenolog_2 CHAR(2) NULL,       # FR if fruits phenolog_3 CHAR(2) NULL,       # BD if buds phenolog_4 CHAR(2) NULL,       # LL if leafless phenolog_5 CHAR(2) NULL,       # FE if fertile phenolog_6 CHAR(2) NULL,       # ST if sterile photo CHAR(2) NULL,		# a photo with specimen precisio TINYINT NULL,		# the precision of the geocode priornam VARCHAR(135) NULL,	# equiv to name comments prot3 VARCHAR(64) NULL,		# protologue 3 proto2 VARCHAR(64) NULL,	# protologue 2 protolog VARCHAR(100) NULL,	# the oldest protologue seedling VARCHAR(12) NULL,	# a cross refereced seedling number slide CHAR(2) NULL,		# if there is a slide slideno VARCHAR(10) NULL,	# the slide number source TINYINT NULL,		# source of collection species VARCHAR(50) NULL, spirit CHAR(2) NULL,		# if there is a spirit collection spiritco VARCHAR(10) NULL,	# the spirit number spqual VARCHAR(10) NULL,	# species qualifier state VARCHAR(28) NULL,		# TDWG level 3 substrat VARCHAR(75) NULL,	# substrate supragrp CHAR(2) NULL,		# supra group F,D,M,A,B,H etc synonym VARCHAR(67) NULL,	# no longer used transferred DATETIME NULL,	# date transferred typedby VARCHAR(8) NULL,	# who created record typeof VARCHAR(67) NULL,	# no longer used typest3 VARCHAR(38) NULL,	# type status 3 typesta2 VARCHAR(38) NULL,	# type status 2 typestat VARCHAR(20) NULL,	# oldest type status typey CHAR(2) NULL,		# if this is a type userid VARCHAR(8) NULL,		# who last edited record voucher CHAR(2) NULL,		# is a voucher for something INDEX (genus, species, infraspn), INDEX (family), INDEX (newloan_1, newloan_2), INDEX (supragrp), INDEX (detby), INDEX (confirme), INDEX (collectr, number), INDEX (colldat_3, colldat_2,colldat_1), INDEX (georeg), INDEX (country), INDEX (state), INDEX (bru), INDEX (declat, declong), INDEX (locality(200)), INDEX (habitat(200)), INDEX (habit(200)), INDEX (notes(200)), INDEX (phenolog_1), INDEX (phenolog_2), INDEX (phenolog_3), INDEX (phenolog_4), INDEX (phenolog_5), INDEX (phenolog_6), INDEX (typedby), INDEX (userid), INDEX (datecrea), INDEX (dateedit), PRIMARY KEY (HERBKEY_1, HERBKEY_2, HERBKEY_3) );

ABCD tables
The following tables were created to better reflect the structure of ABCD

abcd_area
CREATE TABLE abcd_area (  area_name varchar(150) default NULL,   area_class varchar(30) default NULL,   unitID varchar(10) default NULL,   areaID int(11) NOT NULL auto_increment,   PRIMARY KEY  (areaID),   KEY unitID (unitID) )

abcd_biotope
CREATE TABLE abcd_biotope (  unitID varchar(10) default NULL,   parameter varchar(30) default NULL,   value text,   is_quantitative tinyint(4) default NULL,   biotopeID int(11) NOT NULL auto_increment,   PRIMARY KEY  (biotopeID),   KEY unitID (unitID) )

abcd_collector
CREATE TABLE abcd_collector (  collector_name varchar(255) default NULL,   sequence int(11) default NULL,   primary_flag int(11) default NULL,   unitID varchar(10) default NULL,   collectorID int(11) NOT NULL auto_increment,   PRIMARY KEY  (collectorID),   KEY unitID (unitID) )

abcd_higher_taxon
CREATE TABLE abcd_higher_taxon (  htID int(11) NOT NULL auto_increment,   name varchar(80) default NULL,   rank varchar(20) default NULL,   unitID varchar(10) default NULL,   PRIMARY KEY  (htID),   KEY unitID (unitID) )

abcd_identification
CREATE TABLE abcd_identification (  unitID varchar(10) default NULL,   higher_taxon_name varchar(30) default NULL,   higher_taxon_rank varchar(20) default NULL,   scientific_name varchar(100) default NULL,   genus varchar(30) default NULL,   name_addendum varchar(50) default NULL,   species varchar(30) default NULL,   author_team varchar(30) default NULL,   author_team_parenthesis varchar(30) default NULL,   rank varchar(10) default NULL,   infraspecies varchar(30) default NULL,   verification_level varchar(20) default NULL,   identifiers_text varchar(40) default NULL,   identification_date date default NULL,   idenfifiers_role varchar(30) default NULL,   identification_qualifier varchar(20) default NULL,   identification_qualifier_insert int(11) default NULL,   name_comments varchar(200) default NULL,   KEY unitID (unitID) )

abcd_metadata
CREATE TABLE abcd_metadata (  metaID int(11) NOT NULL default '0',   dataset_title varchar(100) default NULL,   technical_contact_name varchar(100) default NULL,   date_modified date default NULL,   owner varchar(100) default NULL,   source_id varchar(10) default NULL,   PRIMARY KEY  (metaID),   KEY source_id (source_id) )

abcd_typification
CREATE TABLE abcd_typification (  unitID varchar(10) default NULL,   type_status varchar(20) default NULL,   basionym varchar(100) default NULL,   KEY unitID (unitID) )

abcd_unit
CREATE TABLE abcd_unit (  unitID varchar(10) NOT NULL default ,   herbarium_source varchar(6) NOT NULL default ,   scientific_name varchar(255) default NULL,   collector_number varchar(15) default NULL,   collection_date date default NULL,   generalised_locality varchar(30) default NULL,   declat_fudged float default NULL,   declong_fudged float default NULL,   geocode_source enum('collector','compiler','generalised arbitrary point','automatically generated') default NULL,   geocode_precision_fudged int(11) default NULL,   genus varchar(30) default NULL,   species varchar(30) default NULL,   rank varchar(15) default NULL,   infraspecies varchar(30) default NULL,   date_last_edited date default NULL,   family varchar(30) default NULL,   declat double default NULL,   declong double default NULL,   geocode_precision int(11) default NULL,   locality text,   alt int(11) default NULL,   depth int(11) default NULL, coordinates_text varchar(100) default NULL, spatial_datum varchar(10) default NULL, notes text, country varchar(40) default NULL, duplicates varchar(153) default NULL, donor varchar(25) default NULL, cultivated char(2) default NULL, PRIMARY KEY (unitID), KEY scientific_name_index (scientific_name), KEY latlong (declat_fudged,declong_fudged) )

The conversion script
This SQL script gets run to convert the data in the specimen table into the various abcd tables (note, this is still a work in progress, so not all data elements are being created just yet). -- in this script, the data will be set by using sed to change -- the value SET @datefrom = ; -- load the unit table REPLACE LOW_PRIORITY INTO abcd_unit (   unitID,    herbarium_source,    scientific_name,    genus,    species,    rank,    infraspecies,    family,    collector_number,    collection_date,    generalised_locality,    declat_fudged,    declong_fudged,    geocode_precision_fudged,    declat,    declong,    geocode_precision,    geocode_source,    date_last_edited,    locality,    alt,    depth,    coordinates_text,    spatial_datum,    notes,    country,    duplicates,    donor,    cultivated) SELECT -- unitID CONCAT(HERBKEY_2,HERBKEY_3), -- herbarium_source "MEL", -- scientific_name CONCAT_WS(" ", genus, species, infrank, infraspn), -- genus genus, -- species species, -- rank CASE infrank WHEN "ssp" 	THEN "subsp." WHEN "ssp." THEN "subsp." WHEN "subsp"	THEN "subsp." WHEN "f" 		THEN "f." WHEN "forma" 	THEN "f." WHEN "var" 	THEN "var." ELSE NULL END ,   -- infraspecies infraspn, --family family, -- collectors_number number, -- collection_date CONCAT_WS("-",       colldat_3,        CASE colldat_2  WHEN "Jan" THEN 01                        WHEN "Feb" THEN 02                        WHEN "Mar" THEN 03                        WHEN "Apr" THEN 04                        WHEN "May" THEN 05                        WHEN "Jun" THEN 06                        WHEN "Jul" THEN 07                        WHEN "Aug" THEN 08                        WHEN "Sep" THEN 09                        WHEN "Oct" THEN 10                        WHEN "Nov" THEN 11                        WHEN "Dec" THEN 12                        ELSE NULL             END,        colldat_1), -- generalised_locality "Nearest locality not available", -- declat_fudged CASE lat1_4 WHEN "S" THEN -1.00 WHEN "N" THEN 1.00 END * (lat1_1 * 1.00) + ((FLOOR(lat1_2/10.0) * 10.0)/60.00), -- declong_fudged CASE long1_4 WHEN "W" THEN -1.00 WHEN "E" THEN 1.00 END * (long1_1 * 1.00) + ((FLOOR(long1_2/10.0) * 10.0)/60.00), -- geocode_precision_fudged -- (note adding in extra for fudging of geocode) CASE precisio WHEN 1 THEN 50 WHEN 2 THEN 1000 WHEN 3 THEN 10000 WHEN 4 THEN 250000 WHEN 5 THEN 110000 ELSE NULL END + (18500),   CASE lat1_4 WHEN "S" THEN -1.00 WHEN "N" THEN 1.00 END * (lat1_1 * 1.000) + (lat1_2/60.00) + (lat1_3/3600.0), -- declong CASE long1_4 WHEN "W" THEN -1.00 WHEN "E" THEN 1.00 END * (long1_1 * 1.000) + (long1_2/60.00) + (long1_3/3600.0), -- geocode_precision CASE precisio WHEN 1 THEN 50 WHEN 2 THEN 1000 WHEN 3 THEN 10000 WHEN 4 THEN 250000 WHEN 5 THEN 110000 ELSE NULL END , -- geocode_source CASE source WHEN 1 THEN "collector" WHEN 2 THEN "compiler" WHEN 3 THEN "compiler" WHEN 4 THEN "collector" ELSE NULL END, -- date_last_edited dateedit, -- locality locality, -- alt altm, -- depth depth, -- coordinates_text IF(lat1_1 OR long1_1,CONCAT_WS(",",lat1_1, lat1_2, lat1_3, lat1_4, long1_1, long1_2, long1_3, long1_4),NULL), -- spatial_datum "gda94", -- notes notes, -- country country, -- duplicates duplicat, -- donor exsiccat, -- cultivated IF(cultivat = "Y", "G", "W") FROM specimens WHERE dateedit >= @datefrom; -- now load the abcd_collector table -- but first delete records that relate to the data we are loading DELETE LOW_PRIORITY from abcd_collector USING abcd_collector, abcd_unit WHERE abcd_unit.unitID = abcd_collector.unitID AND abcd_unit.date_last_edited >= @datefrom; REPLACE LOW_PRIORITY INTO abcd_collector (   collector_name,    sequence,    primary_flag,    unitID) SELECT collectr, 	-- collector_name 1,		-- sequence 1,		-- primary collector flag CONCAT(HERBKEY_2,HERBKEY_3)-- unitID FROM specimens WHERE dateedit >= @datefrom; -- delete old abcd_area data DELETE LOW_PRIORITY FROM abcd_area USING abcd_area, abcd_unit WHERE abcd_unit.unitID = abcd_area.unitID AND abcd_unit.date_last_edited >= @datefrom; -- load in state REPLACE LOW_PRIORITY INTO abcd_area (   area_name,    area_class,    unitID) SELECT bru, "State", CONCAT(HERBKEY_2,HERBKEY_3) FROM specimens WHERE bru IS NOT NULL AND dateedit >= @datefrom; -- load in country REPLACE LOW_PRIORITY INTO abcd_area (   area_name,    area_class,    unitID) SELECT country, "Country", CONCAT(HERBKEY_2,HERBKEY_3) FROM specimens WHERE bru IS NOT NULL AND dateedit >= @datefrom; -- delete LOW_PRIORITY from biotope DELETE FROM abcd_biotope USING abcd_biotope, abcd_unit WHERE abcd_unit.unitID = abcd_biotope.unitID AND abcd_unit.date_last_edited >= @datefrom; -- load into biotope INSERT LOW_PRIORITY INTO abcd_biotope (unitID,parameter,value,is_quantitative) SELECT CONCAT(HERBKEY_2,HERBKEY_3),"habitat",habitat,0 FROM specimens where habitat IS NOT NULL AND dateedit >= @datefrom; INSERT LOW_PRIORITY INTO abcd_biotope (unitID,parameter,value,is_quantitative) SELECT CONCAT(HERBKEY_2,HERBKEY_3),"form",habit,0 FROM specimens where habit IS NOT NULL AND dateedit >= @datefrom; -- load in district (botanical region / district) REPLACE LOW_PRIORITY INTO abcd_area (   area_name,    area_class,    unitID) SELECT regiondi, "sru", CONCAT(HERBKEY_2,HERBKEY_3) FROM specimens WHERE regiondi IS NOT NULL AND dateedit >= @datefrom; -- load into higher_taxon DELETE LOW_PRIORITY from abcd_higher_taxon USING abcd_higher_taxon, abcd_unit WHERE abcd_unit.unitID = abcd_higher_taxon.unitID AND abcd_unit.date_last_edited >= @datefrom; REPLACE LOW_PRIORITY INTO abcd_higher_taxon(   name, rank, unitID) SELECT family, "family", CONCAT(HERBKEY_2,HERBKEY_3) FROM specimens WHERE family IS NOT NULL AND dateedit >= @datefrom; -- update the date last updated in the metadata update abcd_metadata SET date_modified = NOW WHERE source_id = "MEL";