课外天地 李树青学习天地信息检索原理课件 → 可以计算PageRank等操作的Java代码和相应存储过程


  共有21639人关注过本帖树形打印复制链接

主题:可以计算PageRank等操作的Java代码和相应存储过程

帅哥哟,离线,有人找我吗?
admin
  1楼 博客 | 信息 | 搜索 | 邮箱 | 主页 | UC


加好友 发短信 管理员
等级:管理员 帖子:1940 积分:26616 威望:0 精华:34 注册:2003/12/30 16:34:32
可以计算PageRank等操作的Java代码和相应存储过程  发帖心情 Post By:2008/5/6 5:37:38 [只看该作者]

一)数据库准备

1、将WebPageInfo的URL字段设置为小于900,并且对其建立索引:
create index urlIndex on WebPageInfo(URL)

2、将AnchorUrl的URL字段和ANCHORURL字段设置为小于900,并建立索引:
CREATE  INDEX [UrlAndAnchor] ON [dbo].[AnchorUrl]([URL], [ANCHORURL]) ON [PRIMARY]
GO

3、建立其他表资源
CREATE TABLE [IndexInfo] (
        [WORDID] [int] IDENTITY (1, 1) NOT NULL ,
        [KEYWORDS] [char] (1000) COLLATE Chinese_PRC_CI_AS NULL ,
        [DOCID] [int] NULL ,
        [POSITION] [int] NULL ,
        [PR] [float] NULL ,
        CONSTRAINT [PK_IndexInfo] PRIMARY KEY  CLUSTERED
        (
                [WORDID]
        )  ON [PRIMARY]
) ON [PRIMARY]
GO

CREATE TABLE [CharInfo] (
        [CHARID] [int] IDENTITY (1, 1) NOT NULL ,
        [CHAR] [char] (2) COLLATE Chinese_PRC_CI_AS NULL ,
        [DOCID] [int] NULL ,
        [POSITION] [int] NULL ,
        [PR] [float] NULL ,
        CONSTRAINT [PK_CharInfo] PRIMARY KEY  CLUSTERED
        (
                [CHARID]
        )  ON [PRIMARY]
) ON [PRIMARY]
GO


CREATE  INDEX [IX_Char] ON [dbo].[CharInfo]([CHAR]) ON [PRIMARY]
GO

4、建立填充title的存储过程
create proc getTitles
as
begin
        declare webpagecursor cursor
        for SELECT substring(pagetext,1,4000) FROM WebPageInfo

        declare @ptext nchar(4000)
        open webpagecursor
        fetch next from webpagecursor into @ptext
        while (@@fetch_status=0)
        begin
                declare @start int
                declare @end int
                set @ptext=lower(@ptext)
                set @start=CHARINDEX('<title>',@ptext,1)
                set @end=CHARINDEX('</title>',@ptext,1)
                print @start
                print @end
                UPDATE WebPageInfo SET title =substring(@ptext,@start+7,@end-@start-7) WHERE CURRENT OF webpagecursor          
                fetch next from webpagecursor into @ptext
        end
        close webpagecursor
        deallocate webpagecursor
end

5、建立初始化PageRank值的存储过程
create proc initializePr
as
begin
        declare @value float
        select @value=count(*) from WebPageInfo
        update WebPageInfo set pr=1/@value
end

6、建立准备计算PageRank的存储过程
CREATE proc prepareTempPr
as
begin
        if exists (select * from sysobjects where id = object_id('dbo.temppr') and sysstat & 0xf = 3)
        begin
                drop table "dbo"."temppr"
        end
        
        if exists (select * from sysobjects where id = object_id('dbo.prdata') and sysstat & 0xf = 3)
        begin
                drop table "dbo"."prdata"
        end

        select AnchorUrl.docid,count(*) as 'count',min(pr) as 'pr' into temppr from AnchorUrl inner join WebPageInfo on WebPageInfo.docid= AnchorUrl.docid group by AnchorUrl.docid
        select anchorurl.docid,anchorurl.url,anchorurl.anchordocid, anchorurl.anchorurl,count ,pr into prdata from anchorurl inner join temppr on anchorurl.docid=temppr.docid
end
go

7、建立计算PageRank的存储过程
create proc cumputePR
as
begin  
        set nocount on
        
        DECLARE @totalCount int
        set @totalCount=0
        WHILE @totalCount<7
        BEGIN
        
                declare prcursor cursor
                for select docid from temppr
                
                declare @num int
                open prcursor
                fetch next from prcursor into @num
                while (@@fetch_status=0)
                begin
                        declare prcursor2 cursor
                        for select rtrim(url),pr,[count] from prdata where rtrim(anchordocid)=@num
                        declare @url char(200)  
                        declare @prvalue float
                        declare @count int
                        
                        open prcursor2
                        declare @pagerank float
                        set @pagerank=0.0
                        fetch next from prcursor2 into @url,@prvalue,@count
                        while (@@fetch_status=0)
                        begin  
                                set @pagerank=@pagerank+@prvalue/@count
                                fetch next from prcursor2 into @url,@prvalue,@count
                        end
                        print @pagerank
                        update temppr set pr=pr+@pagerank where docid=@num
                        close prcursor2
                        deallocate prcursor2
                        print @num
                        print '+++++++++++++'
                        fetch next from prcursor into @num
                end
                
                declare @newprvalue float
                select @newprvalue=sum(pr) from temppr
                update temppr set pr=pr/@newprvalue
                update prdata set pr=(select pr from temppr where temppr.docid=prdata.docid)
                
                close prcursor
                deallocate prcursor
                
                set @totalCount=@totalCount+1
        END
end

update WebPageInfo set pr=(select pr from temppr where temppr.docid=WebPageInfo.docid)

GO

8、建立提取关键词的存储过程
create proc getKeywords
as begin
        declare kwcursor cursor
        for SELECT docid,substring(pagetext,1,4000) FROM WebPageInfo

        declare @did int
        declare @ptext nchar(4000)
        open kwcursor
        fetch next from kwcursor into @did,@ptext
        while (@@fetch_status=0)
        begin
                declare @start int
                declare @end int
                set @start=0
                set @end=0
                while(1=1)
                begin
                        set @start=CHARINDEX('>',@ptext,@end)
                        set @end=CHARINDEX('<',@ptext,@start)
                        if @start=0 or @end=0
                                break
                        declare @str char(1000)
                        set @str=substring(@ptext,@start+1,@end-@start-1)
                        if ascii(ltrim(rtrim(@str)))!=13 and ascii(ltrim(rtrim(@str)))!=10 and ltrim(rtrim(@str))!=''
                                insert into IndexInfo(KEYWORDS,DOCID,[POSITION]) values(@str,@did,@start)
                end
                fetch next from kwcursor into @did,@ptext
        end
        close kwcursor
        deallocate kwcursor

        update IndexInfo set pr=(select pr from WebPageInfo where WebPageInfo.docid=IndexInfo.docid)
end

go

9、建立全文索引的存储过程
CREATE proc getChar
as
begin  
        set nocount on
        
                declare indexcursor cursor
                for select wordid,keywords,docid,position,pr from IndexInfo
                
                declare @wid int
                declare @word char(1000)
                declare @did int
                declare @pos int
                declare @pr float

                open indexcursor
                fetch next from indexcursor into @wid,@word,@did,@pos,@pr
                while (@@fetch_status=0)
                begin
                        declare @count int
                        set @count=1
                        while(@count<=len(@word))
                        begin
                                declare @chinword char(2)
                                set @chinword=substring(@word,@count,1)
                                if(@chinword!=' ')
                                        insert into CharInfo([char],docid,position,pr) values(@chinword,@did,@pos+@count,@pr)
                                set @count=@count+1
                        end
                        fetch next from indexcursor into @wid,@word,@did,@pos,@pr
                end
                
                close indexcursor
                deallocate indexcursor

                drop index CharInfo.IX_CharInfo
                CREATE  UNIQUE  INDEX [IX_CharInfo] ON [dbo].[CharInfo]([CHARID]) ON [PRIMARY]
end
GO

10、执行process程序

二)Process程序

import java.sql.*;
import java.sql.*;

//运行此程序,无需任何额外准备操作,但是以前的IndexInfo和CharInfo表记录没有自动删除
public class Process
{
        public static Connection con;
        
        //填充标题字段
        public static void fillTitles()
        {
                CallableStatement cstm;
                try
                {
                        Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
                con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
                        cstm=con.prepareCall("{call getTitles}");              
                        cstm.execute();
                        cstm.close();
                        con.close();  
                }
                catch(Exception e)
                {
                        System.out.println(e.getMessage());
                }
        }        
        
        //填充ANCHORDOCID
        public static void fillAnchorDocID()
        {
                Statement stm;
                try
                {
                        Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
                con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
                        stm=con.createStatement();              
                        stm.executeUpdate("update anchorurl set anchordocid=(select docid from WebPageInfo where url=anchorurl.anchorurl)");
                        con.close();  
                }
                catch(Exception e)
                {
                        System.out.println(e.getMessage());
                }      
        }
          
        public static void computePageRankOfAllPages()
        {
                //初始化pagerank值
                CallableStatement cstm;
                try
                {
                        Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
                con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
                        cstm=con.prepareCall("{call initializePr}");            
                        cstm.execute();
                        cstm.close();
                        con.close();  
                }
                catch(Exception e)
                {
                        System.out.println(e.getMessage());
                }
                
                //构造临时表tempPr
                try
                {
                        Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
                con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
                        cstm=con.prepareCall("{call prepareTempPr}");          
                        cstm.execute();
                        cstm.close();
                        cstm=con.prepareCall("{call cumputePR}");              
                        cstm.execute();
                        cstm.close();
                        con.close();  
                }
                catch(Exception e)
                {
                        System.out.println(e.getMessage());
                }      
        }
        
        public static void getCharInKeyword()
        {
                //生成关键词序列和词语序列
                CallableStatement cstm;
                try
                {
                        Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
                        con = DriverManager.getConnection("jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=SE","sa","");
                        cstm=con.prepareCall("{call getKeywords}");            
                        cstm.execute();
                        cstm.close();
                        cstm=con.prepareCall("{call getChar}");        
                        cstm.execute();
                        cstm.close();
                        con.close();  
                }
                catch(Exception e)
                {
                        System.out.println(e.getMessage());
                }
        }
                
        public static void main(String [] args)
        {
                Process.fillTitles();
                Process.fillAnchorDocID();
                Process.computePageRankOfAllPages();
                //Process.getCharInKeyword();
        }
}

[此贴子已经被作者于2010-12-14 09:26:07编辑过]

 回到顶部