티스토리 뷰

IT/JAVA

[JAVA]RSS 파싱(parsing) Source Code

Kanzler 2016. 10. 24. 16:32

RSS(Rich Site Summary)는 뉴스나 블로그 사이트에서 주로 사용하는 콘텐츠 표현 방식입니다. 웹사이트 관리자는 RSS 형식으로 웹사이트의 내용을 제공하고, 이용자는 이 정보를 편리하게 이용할 수 있게 되었습니다.

일일이 해당 사이트에 방문하여 새 글이나 원하는 정보를 수집하는 것이 아니라, RSS를 이용해 원하는 정보를 쉽게 자동으로 수집할 수 있게 되었습니다. 이번 포스팅에서는 이런 RSS를 읽고 파싱하는 자바 소스 코드에 대해서 설명하려고 합니다.


국내에서 운영되는 거의 대부분의 매체(네이버, 다음 등의 포털, 각종 언론·신문사)에서는 RSS를 제공하고 있습니다. 여기에서는 네이버 뉴스를 예로 들어 설명하도록 하겠습니다. RSS 2.0을 기준으로 처리하기 때문에 RSS 2.0 표준을 지키는 RSS에는 모두 동일하게 적용할 수 있습니다. RSS는 XML 형태로 제공됩니다.


네이버 검색어 "자동차"로 검색된 RSS URL :

http://newssearch.naver.com/search.naver?where=rss&query=%EC%9E%90%EB%8F%99%EC%B0%A8&field=0&nx_search_query=&nx_and_query=&nx_sub_query=&nx_search_hlquery=



1. Feed class


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import java.util.ArrayList;
import java.util.List;
 
public class Feed {
    /**
     * 타이틀
     */
    final String title;
    
    /**
     * 링크 url
     */
    final String link;
    
    /**
     * 설명
     */
    final String description;
    
    /**
     * 언어
     */
    final String language;
    
    /**
     *  저작권정보
     */
    final String copyright;
    
    /**
     * 일자
     */
    final String pubDate;
 
    final List<FeedMessage> entries = new ArrayList<FeedMessage>();
 
    public Feed(String title, String link, String description, String language,
                    String copyright, String pubDate) {
            this.title = title;
            this.link = link;
            this.description = description;
            this.language = language;
            this.copyright = copyright;
            this.pubDate = pubDate;
    }
 
    public List<FeedMessage> getMessages() {
            return entries;
    }
 
    public String getTitle() {
            return title;
    }
 
    public String getLink() {
            return link;
    }
 
    public String getDescription() {
            return description;
    }
 
    public String getLanguage() {
            return language;
    }
 
    public String getCopyright() {
            return copyright;
    }
 
    public String getPubDate() {
            return pubDate;
    }
 
    @Override
    public String toString() {
            return "Feed [copyright=" + copyright + ", description=" + description
                            + ", language=" + language + ", link=" + link + ", pubDate="
                            + pubDate + ", title=" + title + "]";
    }
 
}
cs



2.FeedMessage class


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/**
 * Mutable bean describing a single entry (item) of a feed.
 *
 * <p>Every field is a plain string and stays {@code null} until its setter
 * is called.
 */
public class FeedMessage {

    /** Item title. */
    String title;

    /** Item description. */
    String description;

    /** Item link URL. */
    String link;

    /** Item author. */
    String author;

    /** Unique identifier of the item, comparable to a primary key. */
    String guid;

    /** Time the item was written, kept as raw text. */
    String pubdate;

    /** @return the item title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the item title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the item description */
    public String getDescription() {
        return this.description;
    }

    /** @param description the item description */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the item link URL */
    public String getLink() {
        return this.link;
    }

    /** @param link the item link URL */
    public void setLink(String link) {
        this.link = link;
    }

    /** @return the item author */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the item author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the unique identifier of the item */
    public String getGuid() {
        return this.guid;
    }

    /** @param guid the unique identifier of the item */
    public void setGuid(String guid) {
        this.guid = guid;
    }

    /** @return the time the item was written */
    public String getPubdate() {
        return this.pubdate;
    }

    /** @param pubdate the time the item was written */
    public void setPubdate(String pubdate) {
        this.pubdate = pubdate;
    }

    /** Renders all six fields on a single line. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("FeedMessage [title=");
        text.append(title);
        text.append(", description=").append(description);
        text.append(", link=").append(link);
        text.append(", author=").append(author);
        text.append(", guid=").append(guid);
        text.append(", pubdate=").append(pubdate);
        text.append("]");
        return text.toString();
    }

}
cs



3.RSSFeedParser class



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
 
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.XMLEvent;
 
public class RSSFeedParser {
        static final String TITLE = "title";
        static final String DESCRIPTION = "description";
        static final String CHANNEL = "channel";
        static final String LANGUAGE = "language";
        static final String COPYRIGHT = "copyright";
        static final String LINK = "link";
        static final String AUTHOR = "author";
        static final String ITEM = "item";
        static final String PUB_DATE = "pubDate";
        static final String GUID = "guid";
        static final String PUBDATE ="pubdate";
 
        final URL url;
 
        public RSSFeedParser(String feedUrl) {
                try {
                        this.url = new URL(feedUrl);
                } catch (MalformedURLException e) {
                        throw new RuntimeException(e);
                }
        }
 
        public Feed readFeed() {
                Feed feed = null;
                try {
                        boolean isFeedHeader = true;
                        // Set header values intial to the empty string
                        String description = "";
                        String title = "";
                        String link = "";
                        String language = "";
                        String copyright = "";
                        String author = "";
                        String pubdate = "";
                        String guid = "";
 
                        // First create a new XMLInputFactory
                        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
                        // Setup a new eventReader
                        InputStream in = read();
                        XMLEventReader eventReader = inputFactory.createXMLEventReader(in);
                        // read the XML document
                        while (eventReader.hasNext()) {
                                XMLEvent event = eventReader.nextEvent();
                                if (event.isStartElement()) {
                                        String localPart = event.asStartElement().getName()
                                                        .getLocalPart();
                                        switch (localPart) {
                                        case ITEM:
                                                if (isFeedHeader) {
                                                        isFeedHeader = false;
                                                        feed = new Feed(title, link, description, language,
                                                                        copyright, pubdate);
                                                }
                                                event = eventReader.nextEvent();
                                                break;
                                        case TITLE:
                                                title = getCharacterData(event, eventReader);
                                                break;
                                        case DESCRIPTION:
                                                description = getCharacterData(event, eventReader);
                                                break;
                                        case LINK:
                                                link = getCharacterData(event, eventReader);
                                                break;
                                        case GUID:
                                                guid = getCharacterData(event, eventReader);
                                                break;
                                        case LANGUAGE:
                                                language = getCharacterData(event, eventReader);
                                                break;
                                        case AUTHOR:
                                                author = getCharacterData(event, eventReader);
                                                break;
                                        case PUB_DATE:
                                                pubdate = getCharacterData(event, eventReader);
                                                break;
                                        case COPYRIGHT:
                                                copyright = getCharacterData(event, eventReader);
                                                break;
                                        case PUBDATE:
                                            pubdate = getCharacterData(event, eventReader);
                                            break;
                                                
                                        }
                                } else if (event.isEndElement()) {
                                        if (event.asEndElement().getName().getLocalPart() == (ITEM)) {
                                                FeedMessage message = new FeedMessage();
                                                message.setAuthor(author);
                                                message.setDescription(description);
                                                message.setGuid(guid);
                                                message.setLink(link);
                                                message.setTitle(title);
                                                message.setPubdate(pubdate);
                                                feed.getMessages().add(message);
                                                event = eventReader.nextEvent();
                                                continue;
                                        }
                                }
                        }
                } catch (XMLStreamException e) {
                        throw new RuntimeException(e);
                }
                return feed;
        }
 
        private String getCharacterData(XMLEvent event, XMLEventReader eventReader)
                        throws XMLStreamException {
                String result = "";
                event = eventReader.nextEvent();
                if (event instanceof Characters) {
                        result = event.asCharacters().getData();
                }
                return result;
        }
 
        private InputStream read() {
                try {
                        return url.openStream();
                } catch (IOException e) {
                        throw new RuntimeException(e);
                }
        }
}
cs



4.Test the code 


1
2
3
4
5
6
7
8
RSSFeedParser parser = new RSSFeedParser(
                         "http://newssearch.naver.com/search.naver?where=rss&query=%EC%9E%90%EB%8F%99%EC%B0%A8&field=0&nx_search_query=&nx_and_query=&nx_sub_query=&nx_search_hlquery=");
         Feed feed = parser.readFeed();
         System.out.println(feed);
         for (FeedMessage message : feed.getMessages()) {
                 System.out.println(message);
        
         }
cs



5.결과




기본적인 RSS URL을 가지고 XML을 파싱하여 해당 정보를 가져오는 소스 코드입니다. 테스트 시 RSS 주소만 변경해 준 후 사용하시면 됩니다.

출처 : http://www.vogella.com/tutorials/RSSFeed/article.html


댓글