SIGN IN SIGN UP

Azure Data SQL Samples - Official Microsoft GitHub Repository containing code samples for SQL Server, Azure SQL, Azure Synapse, and Azure SQL Edge

0 0 9 Markdown
2019-11-21 09:38:37 -08:00
----------------------------------------------
-- UTF-8 Functional
----------------------------------------------
USE master;
GO
DROP DATABASE IF EXISTS MyNonUtf8Database;
DROP DATABASE IF EXISTS MyUtf8Database;
DROP DATABASE IF EXISTS MyFormerlyUnicodeOnlyDatabase;
DROP DATABASE IF EXISTS MaskingDatabase;
--
-- Create a database that is NOT collated with UTF-8.
-- This demonstrates that you can insert Unicode data into VARCHAR columns collated with UTF-8.
--
CREATE DATABASE MyNonUtf8Database COLLATE SQL_Latin1_General_CP1_CI_AI;
GO
USE MyNonUtf8Database;
GO
CREATE TABLE MyUtf8Table (datakind VARCHAR(100), data VARCHAR(8000) COLLATE Latin1_General_100_CI_AS_SC_UTF8);
GO
INSERT INTO MyUtf8Table
VALUES ('ASCII - 1 byte per character', N'Thequickbrownfoxjumpsoverthelazydog'),
('Cyrillic - 2 bytes per character', N'Быстраякоричневаялисапрыгаетчерезленивуюсобаку'),
('Far East - 3 bytes per character', N'敏捷的棕色狐狸跳过了懒狗'),
('Emojis - 4 bytes per character', N'👶👦👧👨👩👴👵👨👩👨👩👨👩👨👩'),
('Emojis with Variation Selector - 6 bytes per glyph', N'⚕️⚖️↔︎↕︎↖︎↗︎↘︎↙︎↩︎↪︎↔️↕️↖️↗️↘️↙️↩️↪️'),
('Ashi with Supplementary Variation Selector - 7 bytes per glyph', N'芦󠄀芦󠄁芦󠄂芦󠄃芦󠄄芦󠄅芦󠄆芦󠄇芦󠄈芦󠄉芦󠄃芦󠄂芦󠄁芦󠄀芦󠄁芦󠄂芦󠄃芦󠄄芦󠄈芦󠄉');
GO
SELECT datakind, data
FROM MyUtf8Table;
GO
-- This demo used the N' syntax, as string literals are always collated in the collation
-- of the currently active database.
--
-- Create a database collated with UTF-8.
-- This is to demonstrate that now string literals can be used without N'',
-- as string literals are collated with the database collation, and can hold any characters.
--
CREATE DATABASE MyUtf8Database COLLATE Lithuanian_100_CS_AI_WS_SC_UTF8;
GO
USE MyUtf8Database;
GO
CREATE TABLE MyTableWithInheritedCollation (datakind VARCHAR(100), data VARCHAR(8000));
GO
INSERT INTO MyTableWithInheritedCollation
VALUES ('ASCII - 1 byte per character', 'Thequickbrownfoxjumpsoverthelazydog'),
('Cyrillic - 2 bytes per character', 'Быстраякоричневаялисапрыгаетчерезленивуюсобаку'),
('Far East - 3 bytes per character', '敏捷的棕色狐狸跳过了懒狗'),
('Emojis - 4 bytes per character', '👶👦👧👨👩👴👵👨👩👨👩👨👩👨👩'),
('Emojis with Variation Selector - 6 bytes per glyph', '⚕️⚖️↔︎↕︎↖︎↗︎↘︎↙︎↩︎↪︎↔️↕️↖️↗️↘️↙️↩️↪️'),
('Ashi with Supplementary Variation Selector - 7 bytes per glyph', '芦󠄀芦󠄁芦󠄂芦󠄃芦󠄄芦󠄅芦󠄆芦󠄇芦󠄈芦󠄉芦󠄃芦󠄂芦󠄁芦󠄀芦󠄁芦󠄂芦󠄃芦󠄄芦󠄈芦󠄉');
GO
SELECT datakind, data
FROM MyTableWithInheritedCollation;
GO
--
-- Create a collation prefixed with formerly Unicode-only collation (not having its own Windows code page).
-- You can do it now.
--
CREATE DATABASE GonnaFailDueToUnicodeOnlyCollation COLLATE Lao_100_CS_AS_KS_WS_SC;
GO
CREATE DATABASE MyFormerlyUnicodeOnlyDatabase COLLATE Lao_100_CS_AS_KS_WS_SC_UTF8;
GO
USE MyFormerlyUnicodeOnlyDatabase;
GO
CREATE TABLE MyFormerlyUnicodeOnlyTable (datakind VARCHAR(100), data VARCHAR(8000));
GO
INSERT INTO MyFormerlyUnicodeOnlyTable (datakind, data)
SELECT datakind, data
FROM MyNonUtf8Database..MyUtf8Table;
GO
SELECT datakind, data
FROM MyFormerlyUnicodeOnlyTable;
GO
--
-- Demo of one orthogonality feature - data masking
--
CREATE DATABASE MaskingDatabase COLLATE Chinese_PRC_90_CI_AI_SC_UTF8;
GO
USE MaskingDatabase;
GO
CREATE user ToBeKeptAway without LOGIN;
GO
CREATE TABLE KeepAway (top_secret_data VARCHAR(8000) COLLATE Mapudungan_100_CS_AS_SC_UTF8 masked
WITH (FUNCTION = 'partial(2, "💩💩💩💩💩", 2)'));
GO
INSERT INTO KeepAway (top_secret_data)
SELECT data
FROM MyNonUtf8Database..MyUtf8Table;
GO
GRANT SELECT
ON KeepAway
TO ToBeKeptAway;
GO
EXECUTE AS user = 'ToBeKeptAway';
SELECT top_secret_data
FROM KeepAway;
REVERT;
GO
----------------------------
/*
See how many bytes each character requires for both UTF-8 and UTF-16 encodings.
Returns all 65,536 BMP (Base Multilingual Plan) characters (which is also the entire UCS-2 character set), and 3 Supplementary Characters.
Since all Supplementary Characters are 4 bytes in both encodings, there is no need to return more of them, but we do need to see a few of them to see that they are:
a) all 4 bytes
b) encoded slightly differently
*/
;
WITH nums ([CodePoint])
AS (
SELECT TOP (65536) (
ROW_NUMBER() OVER (
ORDER BY (
SELECT 0
)
) - 1
)
FROM [master].[sys].[columns] col
CROSS JOIN [master].[sys].[objects] obj
), chars
AS (
SELECT nums.[CodePoint], CONVERT(VARCHAR(4), NCHAR(nums.[CodePoint]) COLLATE Latin1_General_100_CI_AS_SC_UTF8) AS [TheChar], CONVERT(VARBINARY(4), CONVERT(VARCHAR(4), NCHAR(nums.[CodePoint]) COLLATE Latin1_General_100_CI_AS_SC_UTF8)) AS [UTF8]
FROM nums
UNION ALL
SELECT tmp.val, CONVERT(VARCHAR(4), CONVERT(NVARCHAR(5), tmp.hex) COLLATE Latin1_General_100_CI_AS_SC_UTF8) AS [TheChar], CONVERT(VARBINARY(4), CONVERT(VARCHAR(4), CONVERT(NVARCHAR(5), tmp.hex) COLLATE Latin1_General_100_CI_AS_SC_UTF8)) AS [UTF8]
FROM (
VALUES (65536, 0x00D800DC), -- Linear B Syllable B008 A (U+10000)
(67618, 0x02D822DC), -- Cypriot Syllable Pu (U+10822)
(129384, 0x3ED868DD) -- Pretzel (U+1F968)
) tmp(val, hex)
)
SELECT chr.[CodePoint], COALESCE(chr.[TheChar], N'TOTALS:') AS [Character], chr.[UTF8] AS [UTF8_Hex], DATALENGTH(chr.[UTF8]) AS [UTF8_Bytes], COUNT(CASE DATALENGTH(chr.[UTF8]) WHEN 1 THEN 'x' END) AS [1-byte], COUNT(CASE DATALENGTH(chr.[UTF8]) WHEN 2 THEN 'x' END) AS [2-bytes], COUNT(CASE DATALENGTH(chr.[UTF8]) WHEN 3 THEN 'x' END) AS [3-bytes], COUNT(CASE DATALENGTH(chr.[UTF8]) WHEN 4 THEN 'x' END) AS [4-bytes],
---
CONVERT(VARBINARY(4), CONVERT(NVARCHAR(3), chr.[TheChar])) AS [UTF16(LE)_Hex], DATALENGTH(CONVERT(NVARCHAR(3), chr.[TheChar])) AS [UTF16_Bytes],
---
((DATALENGTH(CONVERT(NVARCHAR(3), chr.[TheChar]))) - (DATALENGTH(chr.[TheChar]))) AS [UTF8savingsOverUTF16]
FROM chars chr
GROUP BY ROLLUP((chr.[CodePoint], chr.[TheChar], chr.[UTF8]));