前言

AI识别必定是未来的主流技术,数据库作为后端的关键核心服务,必须紧跟时代潮流。pgvector向量数据库横空出世,它能够把物体的多种特征变量存储在数据库里,并且通过简单的SQL,对相似的物体进行比较和确认。

pgvector是基于postgresql的一个扩展插件,与postgresql强强联手,能够完成向量数据的存储、计算。

imgsmlr同样也是基于postgresql的一个扩展插件,它能够把非结构化的图片文件以二进制数据的形式读入,再把二进制数据转换成特征数据,最终为转换成向量数据做铺垫。流程如图,结果就是我们从图片中识别出来的真龙:千万图片丛中过,我只找属于我自己的真龙。

2024 ,pgvector怎么使你龙年辨认真龙

环境准备

安装 PostgreSQL 14

我的环境中已安装的包如下,注意libs和devel都必须要有

[root@server128 ~]# rpm -qa |grep -i postgres
postgresql14-14.10-1PGDG.rhel7.x86_64
postgresql-libs-9.2.24-9.el7_9.x86_64
postgresql14-libs-14.10-1PGDG.rhel7.x86_64
postgresql14-server-14.10-1PGDG.rhel7.x86_64
postgresql14-devel-14.10-1PGDG.rhel7.x86_64

安装imgsmlr

# Fetch only the latest revision of the imgsmlr extension source.
git clone --depth 1 https://github.com/postgrespro/imgsmlr
cd imgsmlr/

# Build and install through PGXS.
# NOTE: in this environment the first `make` fails (see the screenshot below)
# until the TRUE/FALSE macros are defined as described in the workaround that
# follows; re-run `make` after applying it.
USE_PGXS=1 make
USE_PGXS=1 make install


2024 ,pgvector怎么使你龙年辨认真龙

解决方法如下:在/usr/pgsql-14/include/server/access/gist.h的第259行位置加入以下宏定义,即可成功编译通过USE_PGXS=1 make

/*
 * Fallback definitions of the TRUE/FALSE macros, added to gist.h so that the
 * imgsmlr build (USE_PGXS=1 make) compiles. Each definition is guarded, so a
 * macro that is already defined is left untouched.
 */
#ifndef FALSE
#define FALSE  (0)
#endif

#ifndef TRUE
#define TRUE   (!FALSE)
#endif

安装pgvector

# Download the pgvector extension source and enter its directory.
git clone  https://github.com/pgvector/pgvector.git
cd pgvector
# Build the extension, then install it.
# NOTE(review): presumably the PostgreSQL 14 pg_config must be on PATH so the
# build targets that installation — confirm.
make
make install

准备语句

-- img_raw holds the original image files.
create table img_raw (
    rawid int primary key,  -- image id
    info  text,             -- label of the image (the inserts store a file name here)
    ts    timestamp,        -- the inserts store now() here
    raw   bytea             -- raw bytes of the image file
);

-- The pattern and signature types used below come from the imgsmlr extension,
-- so it must be created in this database before the table.
create extension if not exists imgsmlr;

-- img_vec holds the feature data extracted from each image in img_raw.
create table img_vec (
    id   int primary key references img_raw (rawid),
    patt pattern,
    sig  signature
);

-- GiST index on the image signatures stored in img_vec.
-- The index is named explicitly so that IF NOT EXISTS can make the statement
-- safe to re-run.
create index if not exists img_vec_sig_idx on img_vec using gist (sig);

-- Load the image files into img_raw.
-- pg_read_binary_file() reads the file on the database server.
-- NOTE(review): presumably this needs superuser or pg_read_server_files
-- privileges — confirm.
-- NOTE(review): the label of row 1 says "png" although the file read is
-- long1.jpg; it is kept as-is because the result listings show this label.

-- Photos of the dragon figure (long1 .. long11).
insert into img_raw (rawid, info, ts, raw) values (1, 'long1,png', now(), pg_read_binary_file('/img/long1.jpg'));
insert into img_raw (rawid, info, ts, raw) values (2, 'long2,jpg', now(), pg_read_binary_file('/img/long2.jpg'));
insert into img_raw (rawid, info, ts, raw) values (3, 'long3,jpg', now(), pg_read_binary_file('/img/long3.jpg'));
insert into img_raw (rawid, info, ts, raw) values (4, 'long4,jpg', now(), pg_read_binary_file('/img/long4.jpg'));
insert into img_raw (rawid, info, ts, raw) values (5, 'long5,jpg', now(), pg_read_binary_file('/img/long5.jpg'));
insert into img_raw (rawid, info, ts, raw) values (6, 'long6,jpg', now(), pg_read_binary_file('/img/long6.jpg'));
insert into img_raw (rawid, info, ts, raw) values (7, 'long7,jpg', now(), pg_read_binary_file('/img/long7.jpg'));
insert into img_raw (rawid, info, ts, raw) values (8, 'long8,jpg', now(), pg_read_binary_file('/img/long8.jpg'));
insert into img_raw (rawid, info, ts, raw) values (9, 'long9,jpg', now(), pg_read_binary_file('/img/long9.jpg'));
insert into img_raw (rawid, info, ts, raw) values (10, 'long10,jpg', now(), pg_read_binary_file('/img/long10.jpg'));
insert into img_raw (rawid, info, ts, raw) values (11, 'long11,jpg', now(), pg_read_binary_file('/img/long11.jpg'));

-- Photos of the other figures.
insert into img_raw (rawid, info, ts, raw) values (12, 'chang1,jpg', now(), pg_read_binary_file('/img/chang1.jpg'));
insert into img_raw (rawid, info, ts, raw) values (13, 'hu1,jpg', now(), pg_read_binary_file('/img/hu1.jpg'));
insert into img_raw (rawid, info, ts, raw) values (14, 'mao1,jpg', now(), pg_read_binary_file('/img/mao1.jpg'));
insert into img_raw (rawid, info, ts, raw) values (15, 'she1,jpg', now(), pg_read_binary_file('/img/she1.jpg'));
insert into img_raw (rawid, info, ts, raw) values (16, 'tian1,jpg', now(), pg_read_binary_file('/img/tian1.jpg'));
insert into img_raw (rawid, info, ts, raw) values (17, 'wo1,jpg', now(), pg_read_binary_file('/img/wo1.jpg'));
insert into img_raw (rawid, info, ts, raw) values (18, 'xiong1,jpg', now(), pg_read_binary_file('/img/xiong1.jpg'));
insert into img_raw (rawid, info, ts, raw) values (19, 'tu1,jpg', now(), pg_read_binary_file('/img/tu1.jpg'));
insert into img_raw (rawid, info, ts, raw) values (20, 'tu2,jpg', now(), pg_read_binary_file('/img/tu2.jpg'));
insert into img_raw (rawid, info, ts, raw) values (21, 'tu3,jpg', now(), pg_read_binary_file('/img/tu3.jpg'));

-- Extract the features of every image in img_raw and store them in img_vec:
-- the pattern comes from jpeg2pattern(), the signature from
-- pattern2signature() applied to that pattern.
insert into img_vec (id, patt, sig)
select
    rawid,
    jpeg2pattern(raw),
    pattern2signature(jpeg2pattern(raw))
from img_raw;

-- The vector type comes from the pgvector extension, so it must be created in
-- this database before the column is added.
create extension if not exists vector;

-- Add the vector column "embedding" to img_vec.
alter table img_vec add column if not exists embedding vector;

-- Convert the signature of each row into a vector embedding.
-- The text form of the signature has its parentheses replaced by braces so
-- that it can be cast to float[] and from there to vector.
-- There is deliberately no WHERE clause: every row of img_vec is converted.
update img_vec
set embedding = replace(replace(sig::text, '(', '{'), ')', '}')::float[]::vector;

-- Rank every image by its <-> distance to the reference image (id = 9),
-- nearest first. The distance expression is written once and the ORDER BY
-- refers to it through its output alias.
select
    t1.id,
    t2.info,
    t1.embedding <-> (select embedding from img_vec where id = 9) as "欧氏类似间隔"
from img_vec as t1
inner join img_raw as t2
    on t1.id = t2.rawid
order by "欧氏类似间隔";


实践:福龙与狗 、熊、兔、蜗牛、植物射手

我自己拍摄了21张照片:针对福龙从不同角度拍了11张(long1~long11),以及其它各式小动物共10张。在postgresql中转换成向量数据后,分别使用欧氏距离、内积、余弦距离三种算法来进行计算

为了证明这个是龙,看它的标签

2024 ,pgvector怎么使你龙年辨认真龙

2024 ,pgvector怎么使你龙年辨认真龙

以long9作为标准图,计算其它图片与它的相似度
​
# 使用欧氏距离算法,用 <-> 运算符进行计算(值越小越相似)
postgres=# select
postgres-#   t1.id,
postgres-#   t2.info,
postgres-#   embedding <-> (select embedding from img_vec where id=9) as "欧氏类似间隔"
postgres-#  from img_vec t1 join img_raw t2 on t1.id=t2.rawid
postgres-#   order by embedding <-> (select embedding from img_vec where id=9);
 id |   info   |   欧氏类似间隔
---- ------------ --------------------
  9 | long9,jpg  |          0
  4 | long4,jpg  | 0.8131946015532012
 14 | mao1,jpg  |  0.873532153956551
  5 | long5,jpg  | 0.9437004404760417
  3 | long3,jpg  |  1.006888032114946
  2 | long2,jpg  | 1.0538826025705688
 17 | wo1,jpg   | 1.0584338866296883
  8 | long8,jpg  | 1.1284268427144581
  1 | long1,png  | 1.1660293246028335
 10 | long10,jpg | 1.1743752827626244
  7 | long7,jpg  | 1.1849544245168537
 11 | long11,jpg | 1.2636647526394387
 18 | xiong1,jpg |  1.28623207769336
 16 | tian1,jpg  | 1.2928207635465576
  6 | long6,jpg  |  1.415787966613027
 15 | she1,jpg  |  1.472345617909073
 12 | chang1,jpg | 1.5688576102868548
 13 | hu1,jpg   | 1.5765029578406513
 19 | tu1,jpg   | 1.6046681136433938
 21 | tu3,jpg   | 1.9848909683454863
 20 | tu2,jpg   |  1.996557491652661
(21 rows)
​
​
# 使用内积算法,用 <#> 运算符进行计算(<#> 返回的是负内积,值越小越相似)
postgres=# select
postgres-#   t1.id,
postgres-#   t2.info,
postgres-#   embedding <#> (select embedding from img_vec where id=9) as "内积类似间隔"
postgres-#  from img_vec t1 join img_raw t2 on t1.id=t2.rawid
postgres-#   order by embedding <#> (select embedding from img_vec where id=9)  ;
 id |   info   |   内积类似间隔
---- ------------ ---------------------
  9 | long9,jpg  |  -9.793889999389648
 15 | she1,jpg  |  -9.56818962097168
  5 | long5,jpg  |  -9.520377159118652
 10 | long10,jpg |  -9.336055755615234
  4 | long4,jpg  |  -9.27934455871582
  8 | long8,jpg  |  -9.063756942749023
 20 | tu2,jpg   |  -9.029650688171387
  1 | long1,png  |  -8.906010627746582
 11 | long11,jpg |  -8.879608154296875
  3 | long3,jpg  |  -8.859291076660156
  2 | long2,jpg  |  -8.828414916992188
  7 | long7,jpg  |  -8.590826034545898
  6 | long6,jpg  |  -8.51888656616211
 14 | mao1,jpg  |  -8.475362777709961
 17 | wo1,jpg   | -7.8920063972473145
 12 | chang1,jpg |  -7.692102432250977
 16 | tian1,jpg  |  -7.567811965942383
 13 | hu1,jpg   |   -6.4823317527771
 18 | xiong1,jpg |  -6.195792198181152
 19 | tu1,jpg   |  -5.272591590881348
 21 | tu3,jpg   |   -4.3834547996521
(21 rows)
​
# 使用余弦距离算法,用 <=> 运算符进行计算(值越小越相似)
postgres=# select
postgres-#   t1.id,
postgres-#   t2.info,
postgres-#   embedding <=> (select embedding from img_vec where id=9) as "余弦类似间隔"
postgres-#  from img_vec t1 join img_raw t2 on t1.id=t2.rawid
postgres-#   order by embedding <=> (select embedding from img_vec where id=9);
 id |   info   |   余弦类似间隔
---- ------------ ----------------------
  9 | long9,jpg  |           0
  4 | long4,jpg  |   0.034229251128992
 14 | mao1,jpg  |  0.03767662614530476
 18 | xiong1,jpg | 0.039896219019039614
  5 | long5,jpg  |  0.04453997228870388
  3 | long3,jpg  |  0.05313360470652673
 17 | wo1,jpg   |  0.05427983467295261
  2 | long2,jpg  |  0.05828036923202573
  8 | long8,jpg  |  0.06559011022972738
 10 | long10,jpg |  0.06853261342194783
  1 | long1,png  |  0.07069949063544745
  7 | long7,jpg  |  0.07420155647983717
 19 | tu1,jpg   |  0.07621919722304038
 11 | long11,jpg |  0.08243281628522736
 16 | tian1,jpg  |  0.08686050822976776
 15 | she1,jpg  |  0.09882522468462784
  6 | long6,jpg  |  0.1048961893306749
 13 | hu1,jpg   |  0.12904889503224415
 12 | chang1,jpg |  0.13378521110088515
 20 | tu2,jpg   |  0.17567972259493136
 21 | tu3,jpg   |  0.1793031852332665
(21 rows)
​
​
​

上面的计算结果不理想。反复查看后,最终发现long9右边有一个鼓起的红包,特征提取时把红包的信息也采集了进去。我把标准图从long9改成long6,重新运行欧氏相似距离计算,这样前5名都是long的图片,而且long6与long9离得老远!

2024 ,pgvector怎么使你龙年辨认真龙

实践:龙与猫

我在网络上随机找了6张猫的图片,以及5张龙的图片,这样可以确保照片来自不同的拍摄工具、不同的图片格式、不同的编码方式、不同的分辨率、不同的大小。

2024 ,pgvector怎么使你龙年辨认真龙

​
​
postgres=#  select
postgres-#   t1.id,
postgres-#   t2.info,
postgres-#   embedding <-> (select embedding from img_vec where id=9) as "欧氏类似间隔"
postgres-#  from img_vec t1 join img_raw t2 on t1.id=t2.rawid
postgres-#   order by embedding <-> (select embedding from img_vec where id=9)
postgres-#   limit 20;
 id |  info  |   欧氏类似间隔
---- ---------- --------------------
 9 | d3,jpg  |         0
 7 | d1,jpeg | 0.7647856568512812
 8 | d2,jpeg | 1.2676685459702477
 5 | cat4,png | 1.3949615071226016
 6 | cat5,jpg | 1.464445502504794
 3 | cat3,jpg | 1.7297577441389917
 11 | d5,jpg  | 2.441273726871975
 1 | cat1,png | 2.449192466777335
 2 | cat2,jpg | 2.623407880151245
 10 | d4,jpeg | 2.8941867638822063
 4 | cat4,jpg | 4.956624524660588
(11 rows)
​
postgres=#  select
postgres-#   t1.id,
postgres-#   t2.info,
postgres-#   embedding <#> (select embedding from img_vec where id=9) as "内积类似间隔"
postgres-#  from img_vec t1 join img_raw t2 on t1.id=t2.rawid
postgres-#   order by embedding <#> (select embedding from img_vec where id=9)
postgres-#   limit 20;
 id |  info  |   内积类似间隔
---- ---------- ---------------------
 11 | d5,jpg  | -13.741656303405762
 10 | d4,jpeg | -12.781399726867676
 1 | cat1,png | -11.762970924377441
 2 | cat2,jpg | -9.446988105773926
 5 | cat4,png | -8.874711036682129
 9 | d3,jpg  |  -8.14848518371582
 4 | cat4,jpg | -8.092750549316406
 7 | d1,jpeg |  -7.85388708114624
 3 | cat3,jpg | -6.499306678771973
 8 | d2,jpeg | -5.019601821899414
 6 | cat5,jpg | -4.994093418121338
(11 rows)
​
postgres=#  select
postgres-#   t1.id,
postgres-#   t2.info,
postgres-#   embedding <=> (select embedding from img_vec where id=9) as "余弦类似间隔"
postgres-#  from img_vec t1 join img_raw t2 on t1.id=t2.rawid
postgres-#   order by embedding <=> (select embedding from img_vec where id=9)
postgres-#   limit 20;
 id |  info  |   余弦类似间隔
---- ---------- ----------------------
 9 | d3,jpg  |          0
 7 | d1,jpeg | 0.03589936156946816
 11 | d5,jpg  | 0.042836060244666685
 8 | d2,jpeg | 0.05975859196041422
 5 | cat4,png | 0.08507819244719383
 1 | cat1,png |  0.1087175221334512
 10 | d4,jpeg | 0.11832410643368074
 6 | cat5,jpg | 0.12351988272197956
 3 | cat3,jpg |  0.1869637379378707
 2 | cat2,jpg |  0.2117638205685053
 4 | cat4,jpg |  0.5035052630227678
(11 rows)

从上面的结果来看,对这些来自五湖四海的不同图片使用三种算法,余弦距离计算的效果最好:除标准图自身外,前三名都是与龙有关的图片

结论

  • imgsmlr和pgvector是好工具,但是要注意图片特征值的提取。例如同样用手机拍照,如果识别对象旁边放了其它东西,向量化计算的准确度就会大打折扣。

  • imgsmlr和pgvector虽好,但是没有可供观测的调试环境:如何调整特征值?如何在内部对图像进行观测?目前还没有这个功能。

  • pgvector今后在AI大模型领域必定大有作为。目前识别虽有误差,但已有80%以上的识别通过率;随着今后进一步的发展,通过后端与前端的合力,必定能够100%识别真龙。

    最后预祝各位同学龙年遇真龙,龙年大乐!