view database/schema/classify.sql @ 44:812c956dd0e9

Add another admin queue type for comments on nonexistent PRs. Also, correct stupid spelling mistake. Need to get the test harness running again.
author David A. Holland
date Mon, 16 Jun 2014 01:27:45 -0400
parents cd36b49f4437
children 40f64a96481f
line wrap: on
line source

--
-- Classification.
--
--
-- Sorting and classifying bug reports usefully is a nontrivial
-- problem. For this reason, rather than bake in any particular
-- taxonomy, we allow multiple arbitrary classification schemes,
-- all of which are created equal. (Administratively, some will
-- be more equal than others, but that does not concern the
-- database.)
--
-- There are several classification schemes/taxonomies that already
-- exist or have been invented but not yet deployed. These fall into
-- the following categories:
--
-- 1. Hierarchical taxonomy. The only existing hierarchical taxonomy
-- is the one dholland uses in the out-of-GNATS PR lists, which
-- classifies PRs by location in the system. A second projected one is
-- classification by consequences (crashes, hangs, etc.) The possible
-- classifications are defined in advance and stored in the database.
--
-- 2. Flat taxonomy, that is, a selection from an enumeration of
-- values. Existing schemes of this type include:
--
--   * the old GNATS scheme of categories (bin, kern, lib, etc.)
--   * the old GNATS scheme of classes (sw-bug, doc-bug, etc.)
--   * the old GNATS severity field (critical, serious, non-critical)
--   * the old GNATS priority field (high, medium, low)
--
-- For these the possible values are defined in advance and stored in
-- the database.
--
-- 3. String value. Most of the time this is really a flat taxonomy
-- where the number of possible values is large or changes dynamically
-- with external circumstances, or both, such that storing a copy of
-- the legal values in the database would create a maintenance hassle.
-- There are none of these at present; the only projected one at the
-- moment is FreeBSD's idea of "the nearest man page to where the
-- problem appears to be".
--
-- 4. Systems of tags, that is, zero or more selections from a list
-- of possible values. Current schemes of this type include the
-- [456]-ONLY tags, the 6-CRITICAL/6-IMPORTANT/etc. release branch
-- tags, the STUCK tag, EASY, PATCH, PULLUPS-NEEDED, etc. Some of
-- these should be separate flat taxonomy schemes rather than tags,
-- and some of them should maybe be states rather than tags too. 
-- (And, perhaps some of the current states should be tags...)
-- In addition we want to allow every developer to have their own
-- private tags that aren't exposed. And, we might want to have
-- project-only tags that other developers can see but users can't.
-- (It remains to be seen if that's really a good idea.)
--
-- Note also that a list of PRs is equivalent to a tag on those PRs.
--
-- It is also possible that there may be additional classifications
-- (possibly of any of the above types, but most likely tags) that are
-- views rather than data, that is, derived from other information.
-- One example of this is "PRs in feedback more than 6 months".
--
--
-- We assume that new classification schemes will be added on the fly,
-- not frequently, but frequently enough that the list of schemes
-- should not be hard-coded into programs that access the database,
-- and we don't want to have to create a new table for each new scheme
-- either.
--
-- Therefore, the way I'll do this is to create one table (or rather,
-- one family of tables) for each type of classification scheme, and
-- make them able to handle arbitrary instances.
--
-- For hierarchical schemes there is no point trying to encode the
-- hierarchical structure in SQL; that is a waste of time. Instead
-- we'll expand the tree of allowed values and rely on the access
-- software to present the schemes sensibly.
--
-- It remains unclear what views ought to be defined.
--
--
-- Notes on the representations:
--    * a scheme is "total" if it should be defined on all PRs
--    * if the data contains a null value for a particular scheme,
--      that means "not specified yet".
--    * the hierarchical and flat schemes are no different schema-
--      wise but are separate to allow different handling in software.
--    * the ordering field in the values tables should be used so that
--      ORDER BY ordering ASCENDING produces the desired output order
--      of the legal values.
--    * the obsolete field for values is for entries that are allowed
--      to still exist in the database but that should not be used
--      with new PRs or for new classifications of old PRs.
--    * the ordering field for the classifications themselves should
--      be used so that ORDER BY ordering ASCENDING produces the
--      desired output order of the classification schemes. The
--      numbers are global across classification scheme types so
--      schemes of the same type do not need to be sorted together.
--
--

------------------------------------------------------------
-- Hierarchical schemes.

-- names and properties of the hierarchical schemes
CREATE TABLE hierclass_names (
	name text		primary key,
	ordering int		not null,
	total boolean		not null,
	description text	not null
)
WITHOUT OIDS;

-- allowed values of the hierarchical schemes
CREATE TABLE hierclass_values (
	scheme text		not null references hierclass_names (name),
	value text		not null,
	ordering int		not null,
	obsolete boolean	not null,
	description text	not null,
	primary key (scheme, value)
)
WITHOUT OIDS;

-- classification of PRs according to the hierarchical schemes
CREATE TABLE hierclass_data (
	pr bigint		not null references PRs (id),
	scheme text		not null references hierclass_names (name),
	value text		,
	primary key (pr, scheme),
	--
	-- Ok, the intended semantics here are:
	--    * For PRs that the classification doesn't apply to, there
	--      should be no row at all in the *class_data table.
	--    * For PRs where the classification is undetermined or
	--      unassigned, the value column should be null, and if
	--      that's allowed there should be a null entry in
	--      the *class_values table.
	-- I believe "MATCH SIMPLE" to be capable of enforcing this,
	-- but that's from reading postgres docs and not from actually
	-- checking it. From the description, "MATCH FULL" will do the
	-- wrong thing, and "MATCH PARTIAL" isn't supported.
	--
	foreign key (scheme, value) references hierclass_values (scheme, value)
		MATCH FULL
		ON DELETE RESTRICT
		ON UPDATE RESTRICT
)
WITHOUT OIDS;

------------------------------------------------------------
-- Enumerated flat schemes.

-- names and properties of the flat schemes
CREATE TABLE flatclass_names (
	name text		primary key,
	ordering int		not null,
	total boolean		not null,
	description text	not null
)
WITHOUT OIDS;

-- allowed values of the flat schemes
CREATE TABLE flatclass_values (
	scheme text		not null references flatclass_names (name),
	value text		not null,
	ordering int		not null,
	obsolete boolean	not null,
	description text	not null,
	primary key (scheme, value)
)
WITHOUT OIDS;

-- classification of PRs according to the flat schemes
CREATE TABLE flatclass_data (
	pr bigint		not null references PRs (id),
	scheme text		not null references flatclass_names (name),
	value text		,
	primary key (pr, scheme),
	-- as above
	foreign key (scheme, value) references flatclass_values (scheme, value)
		MATCH FULL
		ON DELETE RESTRICT
		ON UPDATE RESTRICT
)
WITHOUT OIDS;

------------------------------------------------------------
-- Text schemes.

-- names and properties of the text schemes
CREATE TABLE textclass_names (
	name text		primary key,
	ordering int		not null,
	total boolean		not null,
	description text	not null
)
WITHOUT OIDS;

-- classification of PRs according to the text schemes
CREATE TABLE textclass_data (
	pr bigint		not null references PRs (id),
	scheme text		not null references textclass_names (name),
	value text		,
	primary key (pr, scheme)
)
WITHOUT OIDS;

------------------------------------------------------------
-- Tag schemes.

-- names and properties of the tag schemes
-- (total does not make sense here)
CREATE TABLE tagclass_names (
	name text		primary key,
	ordering int		not null,
	description text	not null
)
WITHOUT OIDS;

-- allowed values of the tag schemes
-- (each PR can reference zero or more of these)
CREATE TABLE tagclass_values (
	scheme text		not null references tagclass_names (name),
	value text		not null,
	ordering int		not null,
	obsolete boolean	not null,
	description text	not null,
	primary key (scheme, value)
)
WITHOUT OIDS;

-- classification of PRs according to the tag schemes
-- (each PR/scheme pair is listed once for each tag attached to it)
CREATE TABLE tagclass_data (
	pr bigint		not null references PRs (id),
	scheme text		not null references tagclass_names (name),
	value text		,
	-- as above
	foreign key (scheme, value) references tagclass_values (scheme, value)
		MATCH FULL
		ON DELETE RESTRICT
		ON UPDATE RESTRICT
)
WITHOUT OIDS;

-- view where tags are collected into a string
-- string_agg() concatenates the value strings
CREATE VIEW tagclass_stringdata AS
   SELECT pr,
	  tagclass_data.scheme as scheme,
	  string_agg(tagclass_data.value, ' '
			ORDER BY tagclass_values.ordering) as value
   FROM tagclass_data, tagclass_values
   WHERE
	tagclass_data.scheme = tagclass_values.scheme
    AND tagclass_data.value = tagclass_values.value
   GROUP BY pr, tagclass_data.scheme
;

------------------------------------------------------------
-- some views

-- views that pull in the ordering fields.
--
-- (If you use only one of the ordering fields, as is likely to be
-- the case in practice, I think we can rely on the query optimizer
-- to drop the join that collects in the other ordering field. If
-- not, maybe this should be restructured.)
--
-- XXX what do these do when _data.value is null?

CREATE VIEW hierclass_data_ordered AS
   SELECT hierclass_data.pr as pr,
          hierclass_data.scheme as scheme,
          hierclass_names.ordering as schemeordering,
          hierclass_data.value as value,
          hierclass_values.ordering as valueordering
   FROM hierclass_data, hierclass_names, hierclass_values
   WHERE
        hierclass_data.scheme = hierclass_names.name
    AND hierclass_data.scheme = hierclass_values.scheme
    AND hierclass_data.value = hierclass_values.value
;

CREATE VIEW flatclass_data_ordered AS
   SELECT flatclass_data.pr as pr,
          flatclass_data.scheme as scheme,
          flatclass_names.ordering as schemeordering,
          flatclass_data.value as value,
          flatclass_values.ordering as valueordering
   FROM flatclass_data, flatclass_names, flatclass_values
   WHERE
        flatclass_data.scheme = flatclass_names.name
    AND flatclass_data.scheme = flatclass_values.scheme
    AND flatclass_data.value = flatclass_values.value
;

--
-- XXX what I'd like to do here is produce textclass_data.value as
-- valueordering; then sorting by valueordering would produce the
-- desired results (the entries sorted by text order of the values).
-- That produces a type conflict though if we combine this view
-- with the other ordered views where the ordering is a number.
--
-- Next best would be to order by textclass_data.value in this view
-- and produce the row number of the result (or some other fresh
-- sequence) as valueordering. However, I don't think that's possible.
--
-- What's here (using a fixed value of 1) will run but it's not
-- particularly desirable.
--
CREATE VIEW textclass_data_ordered AS
   SELECT textclass_data.pr as pr,
          textclass_data.scheme as scheme,
          textclass_names.ordering as schemeordering,
          textclass_data.value as value,
          1 as valueordering
   FROM textclass_data, textclass_names
   WHERE
        textclass_data.scheme = textclass_names.name
;

CREATE VIEW tagclass_data_ordered AS
   SELECT tagclass_data.pr as pr,
          tagclass_data.scheme as scheme,
          tagclass_names.ordering as schemeordering,
          tagclass_data.value as value,
          tagclass_values.ordering as valueordering
   FROM tagclass_data, tagclass_names, tagclass_values
   WHERE
        tagclass_data.scheme = tagclass_names.name
    AND tagclass_data.scheme = tagclass_values.scheme
    AND tagclass_data.value = tagclass_values.value
;

-- ordered version of tagclass_stringdata
--
-- Note that schemeordering is uniquely defined by the group-by
-- and the min() on it is there to satisfy the cookiemonster.
--
CREATE VIEW tagclass_stringdata_ordered AS
   SELECT pr, scheme, min(schemeordering),
          string_agg(value, ' ' ORDER BY valueordering) as value,
          min(valueordering) as valueordering
   FROM tagclass_data_ordered
   GROUP BY pr, scheme
;


-- a view that combines all the classification data.

CREATE VIEW classifications AS
         SELECT * FROM hierclass_data_ordered
   UNION SELECT * FROM flatclass_data_ordered
   UNION SELECT * FROM textclass_data_ordered
   UNION SELECT * FROM tagclass_stringdata_ordered
   ORDER BY schemeordering
;