以文本方式查看主题

-  课外天地 李树青  (http://www.njcie.com/bbs/index.asp)
--  信息检索原理课件  (http://www.njcie.com/bbs/list.asp?boardid=16)
----  结合SQL Server进行的搜狗部分公开数据的实验方法  (http://www.njcie.com/bbs/dispbbs.asp?boardid=16&id=473)

--  作者:admin
--  发布时间:2008/4/27 7:30:38
--  结合SQL Server进行的搜狗部分公开数据的实验方法

1、用户查询日志

1、准备代码
CREATE DATABASE [UserQuery]  ON (NAME = N\'UserQuery_Data\', FILENAME = N\'C:\\Program Files\\Microsoft SQL Server\\MSSQL\\data\\UserQuery_Data.MDF\' , SIZE = 1, FILEGROWTH = 10%) LOG ON (NAME = N\'UserQuery_Log\', FILENAME = N\'C:\\Program Files\\Microsoft SQL Server\\MSSQL\\data\\UserQuery_Log.LDF\' , SIZE = 1, FILEGROWTH = 10%)
COLLATE Chinese_PRC_CI_AS
GO

CREATE TABLE [LogFile] (
[用户] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[查询词] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[数值] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL  ,
[点击URL] [varchar] (500) COLLATE Chinese_PRC_CI_AS NULL ,
)
GO

CREATE TABLE [LogInfo] (
[id] [int] IDENTITY (1, 1) NOT NULL ,
[用户] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[查询词] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[URL结果排名] [int] NULL ,
[点击序号] [int] NULL ,
[点击URL] [varchar] (500) COLLATE Chinese_PRC_CI_AS NULL ,
CONSTRAINT [PK_TABLE1] PRIMARY KEY  CLUSTERED
(
  [id]
)  ON [PRIMARY]
) ON [PRIMARY]
GO

[LogFile] 生成 [LogInfo]
insert into loginfo select [用户],substring([查询词],2,len([查询词])-2),CONVERT(int, substring([数值],1,charindex(\' \',[数值])-1)),CONVERT(int, substring([数值],charindex(\' \',[数值])+1,10)),[点击URL] from logfile

可以加上各个字段的索引

2、分析
1)查看用户查询情况
select top 100 * from loginfo order by [用户],[点击序号]

2)查看用户会话情况
select top 100 用户,count(*) from loginfo group by 用户 order by count(*) desc

select count(distinct 用户) from loginfo

select * from loginfo where 用户=\'13505142608748166\'

select * from loginfo where 用户=\'13505142608748166\' order by 查询词,点击序号

select 用户 from loginfo group by 用户 having max(点击序号)=count(*) order by count(*)

select 用户 from loginfo group by 用户 having max(点击序号)=count(*) and count(*)>100 order by count(*)


3)查询词语情况
select top 100 查询词,count(*) from loginfo group by 查询词  order by count(*) desc
select count(distinct 查询词) from loginfo
select * from loginfo where 查询词=\'张玉凤\' order by 用户

select avg(len(查询词)) from loginfo
select len(查询词),count(*) from loginfo group by len(查询词) order by count(*) desc

select top 100 查询词,count(distinct 用户) from loginfo group by 查询词  order by count(distinct 用户) desc

4)查询浏览结果的情况
select URL结果排名,count(*) from loginfo group by URL结果排名 order by count(*) desc
select 点击序号,count(*) from loginfo group by 点击序号 order by count(*) desc

select 点击序号,count(*) as \'count\' into tempt from loginfo group by 点击序号 order by count(*) desc
select round(count,-4),count(点击序号) from tempt group by round(count,-4) order by count(点击序号) desc
select * from tempt

2、词库分析

1、数据准备
CREATE TABLE [lexiconTemp] (
[词语] [char] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[词频] [int] NULL ,
[词性] [char] (20) COLLATE Chinese_PRC_CI_AS NULL ,

)
GO


添加主键即可

2、分析
1)查看词语分析情况
select len(词语),count(*) from lexicon group by len(词语) order by count(*) desc
select len(词频),count(*) from lexicon group by len(词频) order by count(*) desc
select 词性,count(*) from lexicon group by 词性   order by count(*) desc


2)查看不同词语的词性分布情况
select 词性,count(*) from lexicon where len(词语)=2 group by 词性 order by count(*) desc
select 词性,count(*) from lexicon where 词频<100000 group by 词性 order by count(*) desc

3、词语组合分析

1、数据准备
CREATE TABLE [WordPair] (
[词语组合] [char] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[词频] [int] NULL
) ON [PRIMARY]
GO

2、分析
1)出现最多的组合
select top 1000 * from wordpair order by 词频 desc

大家可以自行练习,并做出自己的实验分析结果



 

[此贴子已经被作者于2010-12-14 09:21:56编辑过]