在Google查詢時看到這個字數統計網站,覺得有趣來玩玩SQL方式
https://www.ifreesite.com/wordcount/
在這個網站查詢字碼表範圍~
https://blog.miniasp.com/post/2019/01/02/Common-Regex-patterns-for-Unicode-characters
得知要的字碼範圍如下:
中文區段 19968~40959
英文大寫 65~90
英文小寫 97~122
數字 48~57
隨便你用哪個SQL@@~我是使用MSSQL
declare @Tmp table(
StrTmp nvarchar(max)
)
insert into @Tmp
values(N'你知道嗎?中秋節是2021/09/21(二),但2021/09/20(一)也是放假日唷')
,(N'Do you know that Mid-Autumn Festival is 2021/09/21 (Tuesday), but 2021/09/20 (Monday) is also a holiday.')
我的方式是這樣~
用Unicode編號查範圍~
select CharType
,Count(0) Num
from (
select (
case when UNICODE(SUBSTRING(StrTmp,Sort,1)) between 65 and 90
or UNICODE(SUBSTRING(StrTmp,Sort,1)) between 97 and 122
then '英文'
when UNICODE(SUBSTRING(StrTmp,Sort,1)) between 19968 and 40959 then '中文'
when UNICODE(SUBSTRING(StrTmp,Sort,1)) between 48 and 57 then '數字'
else '其他'
end
) CharType
from @Tmp
,(
select Row_Number()Over(order by [number]) as Sort
from master..spt_values
) as k
where Sort <= len(StrTmp)
) k
group by CharType
order by Count(0) desc
後來看到一級屠豬士用正規化~我就改了一下SQL查詢方式~
但中文的正規化帶不出來= =||
select CharType
,Count(0) Num
from (
select (
case when PATINDEX('[a-zA-Z]',SUBSTRING(StrTmp,Sort,1)) > 0 then '英文'
when UNICODE(SUBSTRING(StrTmp,Sort,1)) between 19968 and 40959 then '中文'
when PATINDEX('[0-9]',SUBSTRING(StrTmp,Sort,1)) > 0 then '數字'
else '其他'
end
) CharType
from @Tmp
,(
select Row_Number()Over(order by [number]) as Sort
from master..spt_values
) as k
where Sort <= len(StrTmp)
) k
group by CharType
order by Count(0) desc
我用[\u4e00-\u9fff]範圍是沒有的@@a
create table it210708 (
id int
, txt text
);
insert into it210708 values
(1, '你知道嗎?中秋節是2021/09/21(二),但2021/09/20(一)也是放假日唷')
,(2, 'Do you know that Mid-Autumn Festival is 2021/09/21 (Tuesday), but 2021/09/20 (Monday) is also a holiday.');
with t1(single) as (
select unnest(string_to_array(txt, NULL))
from it210708
), t2(chrtype) as (
select case
when single ~ '[\x4e00-\x9fff\x3400-\x4dbf]' then '中文'
when single ~ '\d' then '數字'
when single ~ '[a-zA-Z]' then '英文'
else '其他'
end
from t1
)
select chrtype
, count(*)
from t2
group by 1;
chrtype | count
---------+-------
英文 | 62
中文 | 17
數字 | 32
其他 | 36
(4 rows)