Peter Mount's Blog
Various ramblings on virtually everything

Monday, December 29, 2008

Sets and Nodes with PostgreSQL

At times you need to be able gather information into Sets where an entry can contain one or more entries or sets, or Nodes where an entry has child nodes each of whom can contain their own children. For me it’s usually a geographical tree (Continent, Country, County, Town) or permission roles (Administrator, Moderator, User). Although this is documented elsewhere on the net, here’s how I do this in PostgreSQL utilising pl/pgsql functions.


The tree of nodes consists of one entry in the table per node, with the root node at the top of the tree. Selects on the table are fast, but updates are expensive. As the data contained in the tree rarely changes, this is perfectly fine for most purposes.


For the first example, I’m going to use a geographical tree, so we will have at the end of the exercise the following tree:



First we need a table to store the data. The following SQL Schema defines the node table and inserts the root. Apart from this instance, you should only use the pl/pgsql functions to modify this table:


CREATE SEQUENCE nodeseq;


CREATE TABLE node (

    nodeid BIGINT NOT NULL,

    l BIGINT NOT NULL,

    r BIGINT NOT NULL,

    d INTEGER NOT NULL, -- the depth of this node, 0 = root

    n name NOT NULL,

    PRIMARY KEY (nodeid)

);


CREATE INDEX node_l ON node(l);

CREATE INDEX node_r ON node(r);

CREATE INDEX node_lr ON node(l,r);

CREATE INDEX node_lrd ON node(l,r,d);

CREATE INDEX node_ln ON node(l,n);

CREATE INDEX node_rn ON node(r,n);

CREATE INDEX node_lrn ON node(l,r,n);

CREATE INDEX node_lrdn ON node(l,r,d,n);


-- The root node (0) preinitialised for the structure to work.

INSERT INTO node (nodeid,l,r,d,n) VALUES (0,1,2,0,'root');


Here we have the table initialised with the root node. At first it has the nodeid of 0, l=1, r=2 and d=0. Over time the l and r values will change as nodes are added to the tree.


Now for the first set of pl/pgsql functions. This set of functions handle retrieval of nodes from the tree based on certain criteria. I’ll only show the function definitions for now as I’ll show how to use these later in this article.


Retrieving a node by its nodeid


CREATE OR REPLACE FUNCTION node_get( parentid BIGINT )

RETURNS node

AS $$

DECLARE

    n      node;

BEGIN

    SELECT *

        INTO n

        FROM node

        WHERE nodeid=parentid;

    IF NOT FOUND THEN

        RAISE EXCEPTION 'Node % not found', parentid;

    END IF;

    RETURN n;

END

$$ LANGUAGE 'plpgsql';


Is the node a leaf, i.e. it contains no children


CREATE OR REPLACE FUNCTION node_isleaf( childnodeid BIGINT )

RETURNS boolean

AS $$

DECLARE

    child node;

BEGIN

    child := node_get( childnodeid );

    RETURN (child.r-child.l)==1;

END

$$ LANGUAGE 'plpgsql';


Does one node contain the other


CREATE OR REPLACE FUNCTION node_contains( parentnodeid BIGINT, childnodeid BIGINT )

RETURNS boolean

AS $$

BEGIN

    RETURN node_contains( node_get( parentnodeid ), node_get( childnodeid ) );

END

$$ LANGUAGE 'plpgsql';


CREATE OR REPLACE FUNCTION node_contains( parentnode node, childnode node )

RETURNS boolean

AS $$

BEGIN

    RETURN parentnode.l<=childnode.l AND parentnode.r>=childnode.r;

END

$$ LANGUAGE 'plpgsql';


Retrieve the parent of a node


CREATE OR REPLACE FUNCTION node_parent( rootnodeid BIGINT, childnodeid BIGINT )

RETURNS node

AS $$

DECLARE

    rootnode    node;

    startnode   node;

    parentnode  node;

BEGIN


    IF childnodeid = rootnodeid THEN

        RAISE EXCEPTION 'Node % has no parent', childnodeid;

    END IF;


    -- get the root

    rootnode := node_get( rootnodeid );


    -- get the start

    startnode := node_get( childnodeid );


    -- the root must contain the start

    IF NOT node_contains( rootnode, startnode ) THEN

        RAISE EXCEPTION 'Node % is not contained by the root node %', childnodeid, rootnodeid;

    END IF;


    -- The root is the parent

    IF rootnode.d = (startnode.d-1) THEN

        RETURN rootnode;

    END IF;


    SELECT *

        INTO parentnode

        FROM node

        WHERE l<startnode.l AND r>startnode.r AND d=startnode.d-1;

    IF NOT FOUND THEN

        RAISE EXCEPTION 'Node % has no parent', childnodeid;

    END IF;


    -- the root must contain the parent

    IF NOT node_contains( rootnode, parentnode ) THEN

        RAISE EXCEPTION 'Node % has no parent', childnodeid;

    END IF;


    RETURN parentnode;

END

$$ LANGUAGE 'plpgsql';


Add a child node


This is probably the most important function as it adds a new node as a child to another and as such is the primary means of adding nodes to the tree.


CREATE OR REPLACE FUNCTION node_addchild( parentid BIGINT, name NAME )

RETURNS node

AS $$

DECLARE

    parent  node;

    child   node;

    tmp     TEXT[];

BEGIN


    -- Validate name ensuring it contains no index

    SELECT regexp_matches( name, E'([^\\]]+)(\\[([^\\[]+)\\]){0,1}' )

        INTO tmp;

    IF tmp[2] IS NOT NULL THEN

        RAISE EXCEPTION 'Name % is invalid, it may not contain an index', name;

    END IF;


    parent := node_get( parentid );


    /* We allow same named children, but this would disallow it

    SELECT *

        INTO child

        FROM node

        WHERE l > parent.l AND r < parent.r AND n = name;

    IF FOUND THEN

        RAISE EXCEPTION 'The child % already exists under %', name, parentid;

    END IF;

    */


    UPDATE node

        SET r=r+2

        WHERE r >= parent.r;


    UPDATE node

        SET l=l+2

        WHERE l >= parent.r;


    INSERT INTO node (nodeid,l,r,d,n)

        VALUES (nextval('nodeseq'), parent.r, parent.r+1, parent.d+1, name);


    child := node_get( currval('nodeseq') );

    RETURN child;

END;

$$ LANGUAGE 'plpgsql';


Now using our example tree, the first thing we need to do is to add Europe to the tree as a child of the root. Now the root always has nodeid 0 so we call node_addchild( 0, ‘Europe’ ) to add a new node representing Europe:


test=# select * from node;

 nodeid | l | r | d |  n   

--------+---+---+---+------

      0 | 1 | 2 | 0 | root

(1 row)


test=# select * from node_addchild(0,'Europe');

 nodeid | l | r | d |   n    

--------+---+---+---+--------

      1 | 2 | 3 | 1 | Europe

(1 row)


test=# select * from node;

 nodeid | l | r | d |   n    

--------+---+---+---+--------

      0 | 1 | 4 | 0 | root

      1 | 2 | 3 | 1 | Europe

(2 rows)


Here you’ll notice that after the call to node_addchild() it returned the new node (nodeid 1) and updated the root node so it’s l and r fields bound those of it’s children.


We’ll now add the other nodes except the towns:


test=# select * from node_addchild(1,'United Kingdom');

 nodeid | l | r | d |       n        

--------+---+---+---+----------------

      2 | 3 | 4 | 2 | United Kingdom

(1 row)


test=# select * from node_addchild(2,'Kent');

 nodeid | l | r | d |  n   

--------+---+---+---+------

      3 | 4 | 5 | 3 | Kent

(1 row)


test=# select * from node_addchild(2,'Sussex');

 nodeid | l | r | d |   n    

--------+---+---+---+--------

      4 | 6 | 7 | 3 | Sussex

(1 row)


Adding a node by it’s path


Apart from node_addchild we can also add a node by it’s path. For Canterbury it’s path from Europe is “United Kingdom/Kent/Canterbury” so instead of having to find Kent, we can use the node_addchildbypath function:


test=# select * from node_addchildbypath( 0, 1, 'United Kingdom/Kent/Canterbury' );

 nodeid | l | r | d |     n      

--------+---+---+---+------------

      5 | 5 | 6 | 4 | Canterbury

(1 row)


Here’s the node_addchildbypath function:


CREATE OR REPLACE FUNCTION node_addchildbypath( rootnodeid BIGINT, parentid BIGINT, path TEXT )

RETURNS node

AS $$

DECLARE

    rootnode    node;

    parent      node;

    child       node;

    tmp         TEXT[];

    lastname    TEXT;

    path2       TEXT;

    name2       TEXT;

BEGIN


    rootnode := node_get( rootnodeid );

    parent := node_get( parentid );

    IF NOT node_contains( rootnode, parent ) THEN

        RAISE EXCEPTION 'Parent % is not contained by root %', parentid, rootnodeid;

    END IF;


    -- Extract the last path

    SELECT regexp_split_to_array( path, E'\\/' )

        INTO tmp;

    

    IF array_upper( tmp, 1 ) = 1 THEN

        child := node_addchild( parentid, path );

    ELSE

        -- remove the tail element

        name2 := tmp[ array_upper(tmp, 1 ) ];


        path2 := substr( path, 0, length(path) - length( name2 ) );


        parent := node_findbypath( rootnodeid, parentid, path2 );

        child := node_addchild( parent.nodeid, name2 );

    END IF;


    RETURN child;

END;

$$ LANGUAGE 'plpgsql';


Now we’ll add the last set of towns to the tree:


test=# select * from node_addchildbypath( 0, 1, 'United Kingdom/Kent/Maidstone' );

 nodeid | l | r  | d |     n     

--------+---+----+---+-----------

      6 | 7 | 8 | 4 | Maidstone

(1 row)


test=# select * from node_addchildbypath( 0, 1, 'United Kingdom/Kent/Tonbridge' );

 nodeid | l  | r  | d |     n     

--------+----+----+---+-----------

      7 | 9 | 10 | 4 | Tonbridge

(1 row)


test=# select * from node_addchildbypath( 0, 1, 'United Kingdom/Sussex/Arundel' );

 nodeid | l  | r  | d |    n    

--------+----+----+---+---------

      8 | 13 | 14 | 4  | Arundel

(1 row)


test=# select * from node;

 nodeid | l  | r  | d |       n        

--------+----+----+---+----------------

      6 |  7 |  8 | 4 | Maidstone

      3 |  4 | 11 | 3 | Kent

      7 |  9 | 10 | 4 | Tonbridge

      0 |  1 | 18 | 0 | root

      1 |  2 | 17 | 1 | Europe

      2 |  3 | 16 | 2 | United Kingdom

      4 | 12 | 15 | 3 | Sussex

      8 | 13 | 14 | 4 | Arundel

      5 |  5 |  6 | 4 | Canterbury

(9 rows)


Retrieving the children of a node


When handling sets, you can simply use the following to return all entries contained by a node - here we will ask for all nodes within the United Kingdom:


test=# select * from node_get(2);

 nodeid | l | r  | d |       n        

--------+---+----+---+----------------

      2 | 3 | 16 | 2 | United Kingdom

(1 row)


test=# select * from node where l > 3 and r < 16;

 nodeid | l  | r  | d |     n      

--------+----+----+---+------------

      6 |  7 |  8 | 4 | Maidstone

      3 |  4 | 11 | 3 | Kent

      7 |  9 | 10 | 4 | Tonbridge

      4 | 12 | 15 | 3 | Sussex

      8 | 13 | 14 | 4 | Arundel

      5 |  5 |  6 | 4 | Canterbury

(6 rows)


Now this is fine for sets, but when dealing with a tree we usually need to know those nodes immediately beneath the node, in this case the counties. This is where the node_children function is handy:


test=# select * from node_children(2);

 nodeid | l  | r  | d |   n    

--------+----+----+---+--------

      3 |  4 | 11 | 3 | Kent

      4 | 12 | 15 | 3 | Sussex

(2 rows)


Here’s the definition for node_children - here you’ll notice it’s similar to retrieving the contained entries but it now uses the d column:


CREATE OR REPLACE FUNCTION node_children( parentid BIGINT )

RETURNS SETOF node

AS $$

DECLARE

    parent node;

    r node%rowtype;

BEGIN


    parent := node_get( parentid );


    FOR r

        IN SELECT *

            FROM node

            WHERE l BETWEEN parent.l AND parent.r

                AND d = parent.d+1

            ORDER BY n, nodeid

    LOOP

        RETURN NEXT r;

    END LOOP;


    RETURN;

END;

$$ LANGUAGE 'plpgsql';


Deleting nodes


Deleting a node is not as simple as simply deleting the entry from the table. What we have to do is delete the entry including all of it’s children and then close the gap in the l & r columns where the deleted entries used to be. To handle this we have the node_deleteTree function. For this example we’ll add a new County, town then delete them from the tree:


test=# select * from node_addchildbypath( 0, 1, 'United Kingdom/Cornwall' );NOTICE:  name2 = CornwallNOTICE:  path2 = United Kingdom

 nodeid | l  | r  | d |    n     

--------+----+----+---+----------

      9 | 16 | 17 | 3 | Cornwall

(1 row)


test=# select * from node_addchildbypath( 0, 1, 'United Kingdom/Cornwall/St Austell' );

 nodeid | l  | r  | d |     n      

--------+----+----+---+------------

     10 | 17 | 18 | 4 | St Austell

(1 row)


test=# select * from node;

 nodeid | l  | r  | d |       n        

--------+----+----+---+----------------

      6 |  7 |  8 | 4 | Maidstone

      3 |  4 | 11 | 3 | Kent

      7 |  9 | 10 | 4 | Tonbridge

      4 | 12 | 15 | 3 | Sussex

      8 | 13 | 14 | 4 | Arundel

      0 |  1 | 22 | 0 | root

      1 |  2 | 21 | 1 | Europe

      2 |  3 | 20 | 2 | United Kingdom

      9 | 16 | 19 | 3 | Cornwall

     10 | 17 | 18 | 4 | St Austell

      5 |  5 |  6 | 4 | Canterbury

(11 rows)


test=# select node_deleteTree( 9 );

 node_deletetree 

-----------------

               2

(1 row)


test=# select * from node;

 nodeid | l  | r  | d |       n        

--------+----+----+---+----------------

      6 |  7 |  8 | 4 | Maidstone

      3 |  4 | 11 | 3 | Kent

      7 |  9 | 10 | 4 | Tonbridge

      4 | 12 | 15 | 3 | Sussex

      8 | 13 | 14 | 4 | Arundel

      0 |  1 | 18 | 0 | root

      1 |  2 | 17 | 1 | Europe

      2 |  3 | 16 | 2 | United Kingdom

      5 |  5 |  6 | 4 | Canterbury

(9 rows)


Here’s the definition for node_deleteTree:


CREATE OR REPLACE FUNCTION node_deleteTree( oldnodeid BIGINT )

RETURNS INTEGER

AS $$

DECLARE

    oldnode node;

    c       INTEGER := 0;

    d       INTEGER;

BEGIN

    -- Cannot delete the root

    IF oldnodeid = 0 THEN

        RAISE EXCEPTION 'Cannot delete the root';

    END IF;


    -- Select the node we are deleting

    oldnode := node_get( oldnodeid );


    -- delete the node

    DELETE FROM node

        WHERE l BETWEEN oldnode.l AND oldnode.r;

    GET DIAGNOSTICS c = ROW_COUNT;


    -- now close the gap between l and r, d is the size of the gap to remove

    d := oldnode.r - oldnode.l + 1;

    UPDATE node

        SET l = CASE

                WHEN l > oldnode.l

                    THEN l - d

                ELSE l

            END,

            r = CASE

                WHEN r > oldnode.l

                    THEN r - d

                ELSE r

            END

        WHERE l > oldnode.l

            OR r > oldnode.l;


    RETURN c;

END;

$$ LANGUAGE 'plpgsql';


The full schema used by this article is available at http://retep.org/blog/psql/nodes.txt 

No comments: