Wednesday 31 July 2019

How to display encoded byte stream of a varchar column in PostgreSQL

CREATE TABLE varchar_demo(description varchar(200));

INSERT INTO varchar_demo VALUES('こんにちは');
INSERT INTO varchar_demo VALUES('ABCDE');

SELECT * FROM varchar_demo;
 description 
-------------
 こんにちは
 ABCDE
(2 rows)


CREATE OR REPLACE FUNCTION dump(columnname TEXT, tablename TEXT,
                                rowid INT) RETURNS TEXT
LANGUAGE plpgsql
AS
$function$
DECLARE 
  hex_str TEXT;
  hex_byte TEXT;
  query TEXT;
  i INT;
  len INT;
BEGIN
  query := format('SELECT octet_length(%I) FROM %I offset %s
                   LIMIT 1', columnname, tablename, rowid);
  EXECUTE query INTO len;
  hex_str := 'Len=' || len || ' ';
  FOR i in 1..len LOOP
    query := format('SELECT to_hex(get_byte(%I::bytea, %s)) 
                     FROM %I offset %s LIMIT 1', 
                     columnname, i - 1, tablename, rowid);
    EXECUTE query INTO hex_byte;
    hex_str := hex_str || hex_byte || ',';
  END LOOP;
  RETURN hex_str;
END;
$function$;


select dump('description', 'varchar_demo', 0);
                         dump                         
------------------------------------------------------
 Len=15 e3,81,93,e3,82,93,e3,81,ab,e3,81,a1,e3,81,af,
(1 row)

select dump('description', 'varchar_demo', 1);
         dump          
-----------------------
 Len=5 41,42,43,44,45,
(1 row)

Comparing NVARCHAR2 with VARCHAR2 and NCLOB with CLOB in Oracle

In Oracle by default VARCHAR2/CLOB data is encoded using UTF-8, that can result in a character take up to 4 bytes of storage. However NVARCHAR/NCLOB data is encoded in UTF-16, that results in a character taking 2 bytes each.

First consider the relevant portion of the UTF-8 encoding table along with UNICODE

-----------------------------------------------------
Unicode  Character   UTF-8       Character Name
-----------------------------------------------------
U+3053     こ     e3 81 93 HIRAGANA LETTER KO
U+3093    ん     e3 82 93 HIRAGANA LETTER N
U+306B    に     e3 81 ab HIRAGANA LETTER NI
U+3061    ち     e3 81 a1 HIRAGANA LETTER TI
U+306F    は     e3 81 af HIRAGANA LETTER HA
-----------------------------------------------------

The following example provides the details for varchar2 & nvarchar2
1. Check the relevant database parameters

SELECT * FROM nls_database_parameters WHERE PARAMETER = 'NLS_NCHAR_CHARACTERSET';
-----------------------------------------------------
NLS_NCHAR_CHARACTERSET AL16UTF16
-----------------------------------------------------

SELECT * FROM nls_database_parameters WHERE parameter = 'NLS_CHARACTERSET';
-----------------------------------------------------
NLS_CHARACTERSET AL32UTF8
-----------------------------------------------------

2. Create the VARCHAR2 table

CREATE TABLE varchar2_demo (description VARCHAR2(50));
INSERT INTO varchar2_demo VALUES('こんにちは');
INSERT INTO varchar2_demo VALUES('ABCDE');
COMMIT;

3. Check the data and the encoded byte stream

SELECT description, DUMP(description,1016) FROM varchar2_demo;
--------------------------------------------------------------
こんにちは Typ=1 Len=15 CharacterSet=AL32UTF8: e3,81,93,e3,82,93,e3,81,ab,e3,81,a1,e3,81,af
ABCDE  Typ=1 Len=5 CharacterSet=AL32UTF8: 41,42,43,44,45
--------------------------------------------------------------

4. Create the NVARCHAR2 table

CREATE TABLE nvarchar2_demo (description NVARCHAR2(50));

INSERT INTO nvarchar2_demo VALUES('こんにちは');
INSERT INTO nvarchar2_demo VALUES('ABCDE');
COMMIT;

5. Check the data and the encoded byte stream

SELECT description, DUMP(description,1016) FROM nvarchar2_demo;
--------------------------------------------------------
こんにちは    Typ=1 Len=10 CharacterSet=AL16UTF16: 30,53,30,93,30,6b,30,61,30,6f
ABCDE     Typ=1 Len=10 CharacterSet=AL16UTF16: 0,41,0,42,0,43,0,44,0,45
---------------------------------------------------------

The results of step 4 & 5 show that each character took either 1 or 3 bytes for VARCHAR2 whereas each character took 2 bytes for NVARCHAR2.

The following example provides the details for CLOB & NCLOB


6. Create the CLOB & NCLOB tables

CREATE TABLE clob_demo (id integer, description clob);
INSERT INTO clob_demo VALUES(1, 'こんにちは');
INSERT INTO clob_demo VALUES(2, 'ABCDE');

CREATE TABLE nclob_demo (id integer, description nclob);
INSERT INTO nclob_demo VALUES(1, 'こんにちは');
INSERT INTO nclob_demo VALUES(2, 'ABCDE');

COMMIT;

7. Check the data in the tables

SELECT * FROM clob_demo;
---------------------
1 こんにちは
2 ABCDE
---------------------

SELECT * FROM nclob_demo;
---------------------
1 こんにちは
2 ABCDE
---------------------

8. Check the encoded byte stream of first row in CLOB table

declare
  cursor c1 is select description clb from clob_demo where id = 1;
  c1rec         c1%ROWTYPE;
  dest_offset   integer;
  src_offset    integer;
  lang_context  integer;
  warning       varchar2(1000);
  blb           blob;
  buf           raw(200);
  blb_len       integer;
  blb_offset    integer;
begin
  open c1;
  fetch c1 into c1rec;
  close c1;
  dbms_lob.createtemporary(blb, FALSE);
  dest_offset := 1;
  src_offset := 1;
  lang_context := 0;
  dbms_lob.converttoblob(blb, c1rec.clb, dbms_lob.getlength(c1rec.clb),
                    dest_offset, src_offset, 0, lang_context, warning);
  blb_len := dbms_lob.getlength(blb);
  blb_offset := 1;
  dbms_lob.read(blb, blb_len, blb_offset, buf);
  dbms_output.put_line(blb_len);
  dbms_output.put_line(buf);
end;
/
-------------------------------------------------------
15
E3 81 93 E3 82 93 E3 81 AB E3 81 A1 E3 81 AF
-------------------------------------------------------

9. Check the encoded byte stream of second row in CLOB table

declare
  cursor c1 is select description clb from clob_demo where id = 2;
  c1rec         c1%ROWTYPE;
  dest_offset   integer;
  src_offset    integer;
  lang_context  integer;
  warning       varchar2(1000);
  blb           blob;
  buf           raw(200);
  blb_len       integer;
  blb_offset    integer;
begin
  open c1;
  fetch c1 into c1rec;
  close c1;
  dbms_lob.createtemporary(blb, FALSE);
  dest_offset := 1;
  src_offset := 1;
  lang_context := 0;
  dbms_lob.converttoblob(blb, c1rec.clb, dbms_lob.getlength(c1rec.clb),
                    dest_offset, src_offset, 0, lang_context, warning);
  blb_len := dbms_lob.getlength(blb);
  blb_offset := 1;
  dbms_lob.read(blb, blb_len, blb_offset, buf);
  dbms_output.put_line(blb_len);
  dbms_output.put_line(buf);
end;
/
--------------
5
41 42 43 44 45
--------------

10. Check the encoded byte stream of first row in NCLOB table

declare
  cursor c1 is select description clb from nclob_demo where id = 1;
  c1rec         c1%ROWTYPE;
  dest_offset   integer;
  src_offset    integer;
  lang_context  integer;
  warning       varchar2(1000);
  blb           blob;
  buf           raw(200);
  blb_len       integer;
  blb_offset    integer;
begin
  open c1;
  fetch c1 into c1rec;
  close c1;
  dbms_lob.createtemporary(blb, FALSE);
  dest_offset := 1;
  src_offset := 1;
  lang_context := 0;
  dbms_lob.converttoblob(blb, c1rec.clb, dbms_lob.getlength(c1rec.clb),
                    dest_offset, src_offset, 0, lang_context, warning);
  blb_len := dbms_lob.getlength(blb);
  blb_offset := 1;
  dbms_lob.read(blb, blb_len, blb_offset, buf);
  dbms_output.put_line(blb_len);
  dbms_output.put_line(buf);
end;
/
-------------------------------------------------------
10
30 53 30 93 30 6B 30 61 30 6F
-------------------------------------------------------

11. Check the encoded byte stream of second row in NCLOB table

declare
  cursor c1 is select description clb from nclob_demo where id = 2;
  c1rec         c1%ROWTYPE;
  dest_offset   integer;
  src_offset    integer;
  lang_context  integer;
  warning       varchar2(1000);
  blb           blob;
  buf           raw(200);
  blb_len       integer;
  blb_offset    integer;
begin
  open c1;
  fetch c1 into c1rec;
  close c1;
  dbms_lob.createtemporary(blb, FALSE);
  dest_offset := 1;
  src_offset := 1;
  lang_context := 0;
  dbms_lob.converttoblob(blb, c1rec.clb, dbms_lob.getlength(c1rec.clb),
                    dest_offset, src_offset, 0, lang_context, warning);
  blb_len := dbms_lob.getlength(blb);
  blb_offset := 1;
  dbms_lob.read(blb, blb_len, blb_offset, buf);
  dbms_output.put_line(blb_len);
  dbms_output.put_line(buf);
end;
/
------------------------------------------
10
00 41 00 42 00 43 00 44 00 45
------------------------------------------

The output of step 8,9,10 & 11 show that each data row took 10 bytes for NCLOB, whereas first data row took 15 bytes whereas the second took 5 bytes for CLOB.