1 package net.sf.flock.parser;
2
3 import java.text.SimpleDateFormat;
4 import java.util.Date;
5 import java.util.Iterator;
6 import java.util.List;
7
8 import net.sf.flock.FeedFactoryI;
9 import net.sf.flock.FeedI;
10 import net.sf.flock.FlockResourceException;
11 import net.sf.flock.SubscriptionInfoI;
12
13 import org.apache.log4j.LogManager;
14 import org.apache.log4j.Logger;
15 import org.jdom.Attribute;
16 import org.jdom.Document;
17 import org.jdom.Element;
18 import org.jdom.Namespace;
19
20 public class Rss20Parser implements FeedParserI {
21
22 private final static Logger LOGGER = LogManager.getLogger(Rss20Parser.class);
23
24 private SimpleDateFormat timeFormat = new SimpleDateFormat("yyyy.MM.dd H:mm");
25
26 /***
27 * @see net.sf.flock.parser.FeedParserI#isSuitable(Document)
28 */
29 public boolean isSuitable(Document doc) {
30 Element root = doc.getRootElement();
31 if (!"rss".equals(root.getName())) {
32 return false;
33 }
34 Attribute rssVersion = root.getAttribute("version");
35 if (rssVersion==null || rssVersion.getValue()==null) {
36 return false;
37 }
38 LOGGER.debug("Found RSS version " + rssVersion.getValue());
39 return rssVersion.getValue().startsWith("2.0");
40 }
41
42 /***
43 * @see net.sf.flock.parser.FeedParserI#parse(FeedFactoryI,Document)
44 */
45 public FeedI parse(SubscriptionInfoI subscriptionInfoI,FeedFactoryI feedFactory, Document doc) throws FlockResourceException {
46
47 Date dateParsed = new Date();
48 LOGGER.debug("start parsing.");
49
50 Element root = doc.getRootElement();
51
52 // Get the channel element (only one occurs)
53 Element channel = root.getChild("channel");
54
55 // 1 title element
56 FeedI feed = feedFactory.createFeed(subscriptionInfoI);
57
58 feed.setTitle( HTMLUtil.unescape( channel.getChildTextTrim("title") ) );
59
60 // 1 description element
61 // !!! feed.setDescription(channel.getChildTextTrim("description"));
62
63 // 1 link element
64 feed.setSite(ParserUtil.getURL(channel.getChildTextTrim("link")));
65
66 // 1 language element
67 // !!! feed.setLanguage(channel.getChildTextTrim("language"));
68
69 // 1..n item elements
70 List items = channel.getChildren("item");
71 Iterator i = items.iterator();
72 while (i.hasNext()) {
73 Element item = (Element) i.next();
74
75 // get description element
76 Element elDesc = item.getChild("description");
77 String strDesc = "";
78 if (elDesc != null) {
79 strDesc = elDesc.getTextTrim();
80 }
81
82 // get link element
83 Element elLink = item.getChild("link");
84 String strLink = "";
85 if (elLink != null) {
86 strLink = elLink.getTextTrim();
87 LOGGER.debug("found link '"+strLink+"'");
88 } else {
89 // try 'guid'
90 // ... http://radio.weblogs.com/0112015/rss.xml
91 elLink = item.getChild("guid");
92 if (elLink != null) {
93 strLink = elLink.getTextTrim();
94 LOGGER.debug("found guid '"+strLink+"'");
95 } else {
96 LOGGER.debug("no link/guid");
97 }
98
99 }
100
101
102
103 Date pubDate = ParserUtil.parseDate(item.getChild("pubDate"));
104
105 if (pubDate==null) {
106 // try dc namespace
107 Namespace dcNS = ParserUtil.getNamespace(root, "dc");
108 if (dcNS!=null) {
109 pubDate = ParserUtil.parseDate( item.getChild("date", dcNS) );
110 }
111 }
112
113 LOGGER.debug("date :"+pubDate);
114
115
116 Date creationTime = pubDate!=null ? pubDate : dateParsed;
117 // !!! item.setFound(dateParsed);
118
119 Element elTitle = item.getChild("title");
120 LOGGER.debug("title '"+(elTitle==null ? "null" : elTitle.getTextTrim()) +"'");
121 String title =
122 elTitle!=null ? HTMLUtil.unescape( elTitle.getTextTrim() ) :
123 feed.getTitle() + " " + this.timeFormat.format(creationTime);
124
125 feed.newItem(creationTime, title, strDesc, ParserUtil.getURL(strLink));
126 }
127
128 // 0..1 rating element
129
130 // 0..1 image element
131 /*
132 Element image = channel.getChild("image");
133 if (image != null) {
134 ImageIF rssImage =
135 cBuilder.makeImage(image.getChildTextTrim("title"),
136 getURL(image.getChildTextTrim("url")));
137 Element imgWidth = image.getChild("width");
138 if (imgWidth != null) {
139 try {
140 rssImage.setWidth(Integer.parseInt(imgWidth.getTextTrim()));
141 } catch (NumberFormatException e) {
142 logger.warn(e);
143 }
144 }
145 Element imgHeight = image.getChild("height");
146 if (imgHeight != null) {
147 try {
148 rssImage.setHeight(Integer.parseInt(imgHeight.getTextTrim()));
149 } catch (NumberFormatException e) {
150 logger.warn(e);
151 }
152 }
153 Element imgDescr = image.getChild("description");
154 if (imgDescr != null) {
155 rssImage.setDescription(imgDescr.getTextTrim());
156 }
157 chnl.setImage(rssImage);
158 }
159
160 // 0..1 textinput element
161 Element txtinp = channel.getChild("textinput");
162 if (txtinp != null) {
163 TextInputIF rssTextInput =
164 cBuilder.makeTextInput(txtinp.getChild("title").getTextTrim(),
165 txtinp.getChild("description").getTextTrim(),
166 getURL(txtinp.getChild("link").getTextTrim()));
167 chnl.setTextInput(rssTextInput);
168 }
169
170 // 0..1 copyright element
171 Element copyright = channel.getChild("copyright");
172 if (copyright != null) {
173 chnl.setCopyright(copyright.getTextTrim());
174 }
175
176 // 0..1 pubDate element
177 Element pubDate = channel.getChild("pubDate");
178 // if (pubDate != null) {
179 // chnl.setPubDate(pubDate.getTextTrim());
180 // }
181
182 // 0..1 lastBuildDate element
183 Element lastBuildDate = channel.getChild("lastBuildDate");
184 // if (lastBuildDate != null) {
185 // chnl.setLastBuildDate(lastBuildDate.getTextTrim());
186 // }
187
188 // 0..1 docs element
189
190 // 0..1 managingEditor element
191 Element managingEditor = channel.getChild("managingEditor");
192 if (managingEditor != null) {
193 chnl.setCreator(managingEditor.getTextTrim());
194 }
195
196 // 0..1 webMaster element
197 Element webMaster = channel.getChild("webMaster");
198 if (webMaster != null) {
199 chnl.setPublisher(webMaster.getTextTrim());
200 }
201
202 // 0..1 skipHours element
203 // 0..1 skipDays element
204 */
205 return feed;
206 }
207
208 }
This page was automatically generated by Maven