view database/schema/classify.sql @ 51:ef6d572c4e1e

switch to python3 style print()
author David A. Holland
date Sat, 02 Apr 2022 18:14:40 -0400
parents cd36b49f4437
children 40f64a96481f
line wrap: on
line source

--
-- Classification.
--
--
-- Sorting and classifying bug reports usefully is a nontrivial
-- problem. For this reason, rather than bake in any particular
-- taxonomy, we allow multiple arbitrary classification schemes,
-- all of which are created equal. (Administratively, some will
-- be more equal than others, but that does not concern the
-- database.)
--
-- There are several classification schemes/taxonomies that already
-- exist or have been invented but not yet deployed. These fall into
-- the following categories:
--
-- 1. Hierarchical taxonomy. The only existing hierarchical taxonomy
-- is the one dholland uses in the out-of-GNATS PR lists, which
-- classifies PRs by location in the system. A second projected one is
-- classification by consequences (crashes, hangs, etc.). The possible
-- classifications are defined in advance and stored in the database.
--
-- 2. Flat taxonomy, that is, a selection from an enumeration of
-- values. Existing schemes of this type include:
--
--   * the old GNATS scheme of categories (bin, kern, lib, etc.)
--   * the old GNATS scheme of classes (sw-bug, doc-bug, etc.)
--   * the old GNATS severity field (critical, serious, non-critical)
--   * the old GNATS priority field (high, medium, low)
--
-- For these the possible values are defined in advance and stored in
-- the database.
--
-- 3. String value. Most of the time this is really a flat taxonomy
-- where the number of possible values is large or changes dynamically
-- with external circumstances, or both, such that storing a copy of
-- the legal values in the database would create a maintenance hassle.
-- There are none of these at present; the only projected one at the
-- moment is FreeBSD's idea of "the nearest man page to where the
-- problem appears to be".
--
-- 4. Systems of tags, that is, zero or more selections from a list
-- of possible values. Current schemes of this type include the
-- [456]-ONLY tags, the 6-CRITICAL/6-IMPORTANT/etc. release branch
-- tags, the STUCK tag, EASY, PATCH, PULLUPS-NEEDED, etc. Some of
-- these should be separate flat taxonomy schemes rather than tags,
-- and some of them should maybe be states rather than tags too. 
-- (And, perhaps some of the current states should be tags...)
-- In addition we want to allow every developer to have their own
-- private tags that aren't exposed. And, we might want to have
-- project-only tags that other developers can see but users can't.
-- (It remains to be seen if that's really a good idea.)
--
-- Note also that a list of PRs is equivalent to a tag on those PRs.
--
-- It is also possible that there may be additional classifications
-- (possibly of any of the above types, but most likely tags) that are
-- views rather than data, that is, derived from other information.
-- One example of this is "PRs in feedback more than 6 months".
--
--
-- We assume that new classification schemes will be added on the fly,
-- not frequently, but frequently enough that the list of schemes
-- should not be hard-coded into programs that access the database,
-- and we don't want to have to create a new table for each new scheme
-- either.
--
-- Therefore, the way I'll do this is to create one table (or rather,
-- one family of tables) for each type of classification scheme, and
-- make them able to handle arbitrary instances.
--
-- For hierarchical schemes there is no point trying to encode the
-- hierarchical structure in SQL; that is a waste of time. Instead
-- we'll expand the tree of allowed values and rely on the access
-- software to present the schemes sensibly.
--
-- It remains unclear what views ought to be defined.
--
--
-- Notes on the representations:
--    * a scheme is "total" if it should be defined on all PRs
--    * if the data contains a null value for a particular scheme,
--      that means "not specified yet".
--    * the hierarchical and flat schemes are no different schema-
--      wise but are separate to allow different handling in software.
--    * the ordering field in the values tables should be used so that
--      ORDER BY ordering ASC produces the desired output order
--      of the legal values.
--    * the obsolete field for values is for entries that are allowed
--      to still exist in the database but that should not be used
--      with new PRs or for new classifications of old PRs.
--    * the ordering field for the classifications themselves should
--      be used so that ORDER BY ordering ASC produces the
--      desired output order of the classification schemes. The
--      numbers are global across classification scheme types so
--      schemes of the same type do not need to be sorted together.
--
--

------------------------------------------------------------
-- Hierarchical schemes.

-- Registry of the hierarchical classification schemes: one row per
-- scheme, keyed by its name.
--    * ordering: sort position of the scheme among all schemes
--    * total: true if the scheme should be defined on all PRs
CREATE TABLE hierclass_names (
    name         text     NOT NULL,
    ordering     integer  NOT NULL,
    total        boolean  NOT NULL,
    description  text     NOT NULL,
    PRIMARY KEY (name)
)
WITHOUT OIDS;

-- Legal values for each hierarchical scheme: one row per
-- (scheme, value) pair.
--    * ordering: sort position of the value within its scheme
--    * obsolete: the value may still exist in the database but should
--      not be used for new classifications
CREATE TABLE hierclass_values (
    scheme       text     NOT NULL REFERENCES hierclass_names (name),
    value        text     NOT NULL,
    ordering     integer  NOT NULL,
    obsolete     boolean  NOT NULL,
    description  text     NOT NULL,
    PRIMARY KEY (scheme, value)
)
WITHOUT OIDS;

-- classification of PRs according to the hierarchical schemes
--
-- Intended semantics:
--    * For PRs that the classification doesn't apply to, there
--      should be no row at all in this table.
--    * For PRs where the classification is undetermined or
--      unassigned ("not specified yet"), there is a row whose
--      value column is null.
--
-- The composite foreign key uses MATCH SIMPLE: a row whose value is
-- null is not checked against hierclass_values, while a non-null
-- value must name an existing (scheme, value) row there. MATCH FULL
-- does the wrong thing here: scheme is NOT NULL, so MATCH FULL
-- rejects every row whose value is null. ("MATCH PARTIAL" isn't
-- supported.)
--
-- Note that hierclass_values cannot hold a null entry (value is part
-- of its primary key), so whether a given scheme permits an
-- unassigned value is not enforced by this constraint.
--
CREATE TABLE hierclass_data (
	pr bigint		not null references PRs (id),
	scheme text		not null references hierclass_names (name),
	value text		,
	primary key (pr, scheme),
	foreign key (scheme, value) references hierclass_values (scheme, value)
		MATCH SIMPLE
		ON DELETE RESTRICT
		ON UPDATE RESTRICT
)
WITHOUT OIDS;

------------------------------------------------------------
-- Enumerated flat schemes.

-- Registry of the enumerated flat classification schemes: one row per
-- scheme, keyed by its name.
--    * ordering: sort position of the scheme among all schemes
--    * total: true if the scheme should be defined on all PRs
CREATE TABLE flatclass_names (
    name         text     NOT NULL,
    ordering     integer  NOT NULL,
    total        boolean  NOT NULL,
    description  text     NOT NULL,
    PRIMARY KEY (name)
)
WITHOUT OIDS;

-- Legal values for each flat scheme: one row per (scheme, value) pair.
--    * ordering: sort position of the value within its scheme
--    * obsolete: the value may still exist in the database but should
--      not be used for new classifications
CREATE TABLE flatclass_values (
    scheme       text     NOT NULL REFERENCES flatclass_names (name),
    value        text     NOT NULL,
    ordering     integer  NOT NULL,
    obsolete     boolean  NOT NULL,
    description  text     NOT NULL,
    PRIMARY KEY (scheme, value)
)
WITHOUT OIDS;

-- classification of PRs according to the flat schemes
--
-- Intended semantics:
--    * no row at all for PRs the classification doesn't apply to;
--    * a row with a null value for PRs where the classification is
--      undetermined or unassigned ("not specified yet").
--
-- The composite foreign key uses MATCH SIMPLE: a row whose value is
-- null is not checked against flatclass_values, while a non-null
-- value must name an existing (scheme, value) row there. MATCH FULL
-- would reject every null value, because scheme is NOT NULL.
-- flatclass_values cannot hold a null entry (value is part of its
-- primary key), so whether a scheme permits an unassigned value is
-- not enforced by this constraint.
--
CREATE TABLE flatclass_data (
	pr bigint		not null references PRs (id),
	scheme text		not null references flatclass_names (name),
	value text		,
	primary key (pr, scheme),
	foreign key (scheme, value) references flatclass_values (scheme, value)
		MATCH SIMPLE
		ON DELETE RESTRICT
		ON UPDATE RESTRICT
)
WITHOUT OIDS;

------------------------------------------------------------
-- Text schemes.

-- Registry of the free-text classification schemes: one row per
-- scheme, keyed by its name. There is no companion values table for
-- this scheme type.
--    * ordering: sort position of the scheme among all schemes
--    * total: true if the scheme should be defined on all PRs
CREATE TABLE textclass_names (
    name         text     NOT NULL,
    ordering     integer  NOT NULL,
    total        boolean  NOT NULL,
    description  text     NOT NULL,
    PRIMARY KEY (name)
)
WITHOUT OIDS;

-- Per-PR values for the text schemes: at most one row per
-- (pr, scheme). The value is an arbitrary string and is not checked
-- against any list of legal values; it may be null.
CREATE TABLE textclass_data (
    pr      bigint  NOT NULL REFERENCES PRs (id),
    scheme  text    NOT NULL REFERENCES textclass_names (name),
    value   text,
    PRIMARY KEY (pr, scheme)
)
WITHOUT OIDS;

------------------------------------------------------------
-- Tag schemes.

-- Registry of the tag schemes: one row per scheme, keyed by its name.
-- (There is no "total" column; it does not make sense for tags.)
--    * ordering: sort position of the scheme among all schemes
CREATE TABLE tagclass_names (
    name         text     NOT NULL,
    ordering     integer  NOT NULL,
    description  text     NOT NULL,
    PRIMARY KEY (name)
)
WITHOUT OIDS;

-- Legal tags for each tag scheme: one row per (scheme, value) pair.
-- A PR can reference zero or more of these.
--    * ordering: sort position of the tag within its scheme
--    * obsolete: the tag may still exist in the database but should
--      not be used for new classifications
CREATE TABLE tagclass_values (
    scheme       text     NOT NULL REFERENCES tagclass_names (name),
    value        text     NOT NULL,
    ordering     integer  NOT NULL,
    obsolete     boolean  NOT NULL,
    description  text     NOT NULL,
    PRIMARY KEY (scheme, value)
)
WITHOUT OIDS;

-- Tags attached to PRs: each PR/scheme pair appears once for every
-- tag of that scheme attached to the PR.
--
-- NOTE(review): with MATCH FULL and scheme declared NOT NULL, a row
-- with a null value is rejected, so value is effectively mandatory.
-- No primary key or unique constraint is declared here, so nothing
-- in this table stops the same tag being recorded twice for a PR.
CREATE TABLE tagclass_data (
    pr      bigint  NOT NULL REFERENCES PRs (id),
    scheme  text    NOT NULL REFERENCES tagclass_names (name),
    value   text,
    -- the (scheme, value) pair must be a legal tag of that scheme
    FOREIGN KEY (scheme, value) REFERENCES tagclass_values (scheme, value)
        MATCH FULL
        ON DELETE RESTRICT
        ON UPDATE RESTRICT
)
WITHOUT OIDS;

-- One row per (pr, scheme) with all of the PR's tags in that scheme
-- joined into a single space-separated string by string_agg(). The
-- tags appear in the order given by tagclass_values.ordering.
CREATE VIEW tagclass_stringdata AS
SELECT
    d.pr     AS pr,
    d.scheme AS scheme,
    string_agg(d.value, ' ' ORDER BY v.ordering) AS value
FROM tagclass_data AS d
INNER JOIN tagclass_values AS v
    ON v.scheme = d.scheme
   AND v.value = d.value
GROUP BY d.pr, d.scheme
;

------------------------------------------------------------
-- some views

-- views that pull in the ordering fields.
--
-- (If you use only one of the ordering fields, as is likely to be
-- the case in practice, I think we can rely on the query optimizer
-- to drop the join that collects in the other ordering field. If
-- not, maybe this should be restructured.)
--
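-- XXX what do these do when _data.value is null? As written, the
-- equality test against the *class_values table is never true for a
-- null value, so such rows do not appear in the views that join
-- against a values table.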

-- hierclass_data plus the sort keys of the scheme (schemeordering)
-- and of the value within the scheme (valueordering). Rows whose
-- value is null have no matching hierclass_values row and are
-- therefore not returned.
CREATE VIEW hierclass_data_ordered AS
SELECT
    d.pr       AS pr,
    d.scheme   AS scheme,
    n.ordering AS schemeordering,
    d.value    AS value,
    v.ordering AS valueordering
FROM hierclass_data AS d
INNER JOIN hierclass_names AS n
    ON n.name = d.scheme
INNER JOIN hierclass_values AS v
    ON v.scheme = d.scheme
   AND v.value = d.value
;

-- flatclass_data plus the sort keys of the scheme (schemeordering)
-- and of the value within the scheme (valueordering). Rows whose
-- value is null have no matching flatclass_values row and are
-- therefore not returned.
CREATE VIEW flatclass_data_ordered AS
SELECT
    d.pr       AS pr,
    d.scheme   AS scheme,
    n.ordering AS schemeordering,
    d.value    AS value,
    v.ordering AS valueordering
FROM flatclass_data AS d
INNER JOIN flatclass_names AS n
    ON n.name = d.scheme
INNER JOIN flatclass_values AS v
    ON v.scheme = d.scheme
   AND v.value = d.value
;

--
-- XXX ideally valueordering here would be textclass_data.value
-- itself, so that sorting by valueordering sorts the entries by the
-- text order of their values. That produces a type conflict, though,
-- once this view is combined with the other ordered views, where
-- the ordering is a number.
--
-- Next best would be to order by textclass_data.value in this view
-- and hand out the row number of the result (or some other fresh
-- sequence) as valueordering. It was not clear that this is possible.
-- NOTE(review): a window function such as
-- dense_rank() OVER (PARTITION BY scheme ORDER BY value) might do
-- it -- untested, and its result type would have to match the
-- other views.
--
-- What's here (a constant valueordering of 1) will run, but it's not
-- particularly desirable.
--
CREATE VIEW textclass_data_ordered AS
SELECT
    d.pr       AS pr,
    d.scheme   AS scheme,
    n.ordering AS schemeordering,
    d.value    AS value,
    1          AS valueordering
FROM textclass_data AS d
INNER JOIN textclass_names AS n
    ON n.name = d.scheme
;

-- tagclass_data plus the sort keys of the scheme (schemeordering)
-- and of the tag within the scheme (valueordering); one row per
-- attached tag.
CREATE VIEW tagclass_data_ordered AS
SELECT
    d.pr       AS pr,
    d.scheme   AS scheme,
    n.ordering AS schemeordering,
    d.value    AS value,
    v.ordering AS valueordering
FROM tagclass_data AS d
INNER JOIN tagclass_names AS n
    ON n.name = d.scheme
INNER JOIN tagclass_values AS v
    ON v.scheme = d.scheme
   AND v.value = d.value
;

-- ordered version of tagclass_stringdata
--
-- One row per (pr, scheme): the PR's tags in that scheme joined into
-- a space-separated string (in valueordering order), together with
-- the scheme's sort key and the smallest sort key among the tags.
--
-- Note that schemeordering is uniquely defined by the group-by
-- (every row of a scheme carries the same value); the min() on it is
-- there only because it is not itself a GROUP BY column. It is
-- aliased back to schemeordering so that this view exposes the same
-- column names as the other *_ordered views.
--
CREATE VIEW tagclass_stringdata_ordered AS
   SELECT pr, scheme,
          min(schemeordering) as schemeordering,
          string_agg(value, ' ' ORDER BY valueordering) as value,
          min(valueordering) as valueordering
   FROM tagclass_data_ordered
   GROUP BY pr, scheme
;


-- All classification data in one place: the rows of the four
-- per-type ordered views stacked together and sorted by scheme
-- ordering.
--
-- UNION matches the branches' columns by position and takes the
-- output column names from the first branch; it also removes rows
-- that are identical in every column.
CREATE VIEW classifications AS
    SELECT *
    FROM hierclass_data_ordered
UNION
    SELECT *
    FROM flatclass_data_ordered
UNION
    SELECT *
    FROM textclass_data_ordered
UNION
    SELECT *
    FROM tagclass_stringdata_ordered
ORDER BY schemeordering
;