Skip to content

Commit

Permalink
refactor: 资讯处理格式
Browse files Browse the repository at this point in the history
fix: 部分学院资讯为空
  • Loading branch information
msojocs committed Sep 10, 2021
1 parent 4022cf7 commit e2e874f
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 69 deletions.
5 changes: 3 additions & 2 deletions src/main/java/cn/wecuit/backen/controller/NewsController.java
Original file line number Diff line number Diff line change
Expand Up @@ -281,11 +281,11 @@ private String getNewsContent(String link) throws IOException, ParseException {
put("jsjxy.cuit.edu.cn", "//body/div[2]/div[2]/div/div/div/form/div/div/div/div");
put("kzgcxy.cuit.edu.cn", "//*[@id=\"vsb_newscontent\"]");
put("rjgcxy.cuit.edu.cn", "//body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/table/tbody/tr[2]/td/form/table");
put("txgcxy.cuit.edu.cn", "//body/div[2]/div/div[2]/div[2]/form");
put("txgcxy.cuit.edu.cn", "//body/div[4]/div[2]/div/div[2]/div/form/table");
put("wgyxy.cuit.edu.cn", "//*[@id=\"vsb_content\"]");
put("cyber.cuit.edu.cn", "//body/div[3]/div[2]/div[2]/form/div");
put("math.cuit.edu.cn", "//body/div[4]/div/div[2]/div/div/div/div/table/tbody/tr/td");
put("hjgcx.cuit.edu.cn", "//body/table[3]/tbody/tr[1]/td[3]/table[2]/tbody/tr/td/form/table");
put("hjgcx.cuit.edu.cn", "//body/div[4]/div/div[2]/ul/table/tbody/tr[2]/td/form/table/tbody");
put("qkl.cuit.edu.cn", "//body/div[4]/div/div[2]/div[2]/form/div");
put("jwc.cuit.edu.cn", "//body/nav[3]/form/div");
put("dzgcxy.cuit.edu.cn", "//body/table/tbody/tr[4]/td/table/tbody/tr/td[4]/table/tbody/tr[3]/td/table/tbody/tr/td/form/table");
Expand Down Expand Up @@ -323,6 +323,7 @@ public String doReplace(String text, int index, Matcher matcher) {
* @return String 新闻主体html
* @throws IOException 流异常 [来自HTTP请求处理]
*/
@Deprecated
private String getHomeContent(String link) throws IOException, ParseException {
link = link.replace("http://", "https://").replace(".aspx", "");
String html = HttpUtil.doGet(link);
Expand Down
16 changes: 15 additions & 1 deletion src/main/java/cn/wecuit/backen/entity/News.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public class News {

private List<Map<String,String>> tags;

private String pattern;
private PatternType pattern;

private String uriExp;

Expand All @@ -38,4 +38,18 @@ public String toString() {
", pullVer=" + pullVer +
'}';
}

/**
 * Describes how a news list page is parsed: the regular expression to run
 * and which capture group holds each field of a list entry.
 */
@Data
public static
class PatternType{
/** Regular expression source; the service passes it to {@code Pattern.compile}. */
private String rule;
/** Capture-group indexes for the fields extracted from each match of {@link #rule}. */
private PatternPos pos;

/**
 * Capture-group indexes, used by the service as arguments to
 * {@code Matcher.group(int)}.
 *
 * NOTE(review): the fields are boxed {@code Integer}s; a missing value would
 * be {@code null} and fail on unboxing at the call site — confirm the
 * configuration always supplies all three.
 */
@Data
public static class PatternPos{
/** Index of the capture group containing the entry title. */
private Integer title;
/** Index of the capture group containing the entry link. */
private Integer link;
/** Index of the capture group containing the entry date. */
private Integer date;
}
}
}
138 changes: 72 additions & 66 deletions src/main/java/cn/wecuit/backen/services/impl/NewsServiceImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -188,17 +188,19 @@ private Map<String, Object> v1_list(String tag, int page) {

String body = HttpUtil.doGet(uri);

Pattern compile = Pattern.compile(news.getPattern());
News.PatternType pattern = news.getPattern();
News.PatternType.PatternPos pos = pattern.getPos();
Pattern compile = Pattern.compile(pattern.getRule());
Matcher matcher = compile.matcher(body);

ret.put("next", body.contains("class=\"Next\">下页</a>"));

Map<String, String> jo;
while (matcher.find()) {
jo = new HashMap<>();
jo.put("date", matcher.group(3).replaceAll("/", "-").replaceAll("\\[|]", ""));
jo.put("title", matcher.group(2));
jo.put("link", matcher.group(1));
jo.put("date", matcher.group(pos.getDate()).replaceAll("/", "-").replaceAll("\\[|]", ""));
jo.put("title", matcher.group(pos.getTitle()));
jo.put("link", matcher.group(pos.getLink()));
if (!matcher.group(2).contains("党"))
list.add(jo);
}
Expand Down Expand Up @@ -268,22 +270,23 @@ private Map<String, Object> v2_list(String tag, String page) {
} else {
ret.put("next", null);
}

News.PatternType pattern = news.getPattern();
News.PatternType.PatternPos pos = pattern.getPos();
// 解析列表
compile = Pattern.compile(news.getPattern());
compile = Pattern.compile(pattern.getRule());
matcher = compile.matcher(body);

Map<String, String> jo;
while (matcher.find()) {
// 真实路径处理
String link = link_pre + matcher.group(1);
String link = link_pre + matcher.group(pos.getLink());
URL url = new URL(link);
link = url.getPath() + (url.getQuery() != null ? "?" + url.getQuery() : "");
link = getRealPath(link);

jo = new HashMap<>();
jo.put("date", matcher.group(3).replaceAll("/", "-").replaceAll("\\[|]", ""));
jo.put("title", matcher.group(2));
jo.put("date", matcher.group(pos.getDate()).replaceAll("/", "-").replaceAll("\\[|]", ""));
jo.put("title", matcher.group(pos.getTitle()));
jo.put("link", link);
if (!matcher.group(2).contains("党"))
list.add(jo);
Expand All @@ -301,59 +304,6 @@ private Map<String, Object> v2_list(String tag, String page) {
return ret;
}

// 版本三
/**
 * Version-3 pull: for every configured tag, fetches its list via
 * {@code v3_list}, writes it to {@code <dir>/<source>/<tag>_1.json}, and
 * finally writes the tag list itself to {@code tags.json}.
 *
 * A failure while fetching or writing one tag is printed and does not stop
 * the remaining tags; each tag's {@code "total"} entry is set to {@code "1"}
 * regardless of whether its fetch succeeded.
 */
public void v3_pull() {
    final String path = this.dir + "/" + news.getSource();
    final File folder = new File(path);

    // Create the output directory when nothing exists at that path yet.
    boolean missing = !(folder.exists() || folder.isDirectory());
    if (missing) {
        System.out.println(folder.mkdirs());
    }

    for (Map<String, String> tagInfo : news.getTags()) {
        String name = tagInfo.get("name");
        try {
            Map<String, Object> page = v3_list(name);
            page.put("name", news.getName());
            String target = path + "/" + name + "_1.json";
            FileUtil.WriteFile(target, JsonUtil.obj2String(page));
        } catch (IOException | ParseException e) {
            e.printStackTrace();
        }
        // This version always records exactly one page per tag.
        tagInfo.put("total", "1");
    }

    FileUtil.WriteFile(path + "/tags.json", JsonUtil.obj2String(news.getTags()));
}

/**
 * Fetches the news list page for one tag from www.cuit.edu.cn and extracts
 * its entries.
 *
 * @param tag value appended to the {@code NewsList?id=} query parameter
 * @return a map with {@code "domain"} (the fixed host name) and {@code "list"}
 *         (one map per entry with {@code "title"}, {@code "link"} and
 *         {@code "date"}); entries whose title contains "党" are omitted
 * @throws IOException    propagated from the HTTP request
 * @throws ParseException propagated from the HTTP request handling
 */
private Map<String, Object> v3_list(String tag) throws IOException, ParseException {
    String html = HttpUtil.doGet("https://www.cuit.edu.cn/NewsList?id=" + tag);
    // Remove all CR/LF characters before handing the markup to the XPath parser.
    html = html.replaceAll("[\\r\\n]", "");

    // Plain maps instead of double-brace initialization: no anonymous HashMap
    // subclasses, no hidden reference to the enclosing service instance.
    List<Map<String, String>> list = new LinkedList<>();
    Map<String, Object> ret = new HashMap<>();
    ret.put("domain", "www.cuit.edu.cn");
    ret.put("list", list);

    JXDocument jxDocument = JXDocument.create(html);
    List<JXNode> jxNodes = jxDocument.selN("//*[@id=\"NewsListContent\"]/li");
    for (JXNode node : jxNodes) {
        Element item = node.asElement();
        // Child 1 carries both the title text and the href; child 2 carries the date.
        Element linkNode = item.child(1);
        String title = linkNode.text();
        String link = linkNode.attr("href");
        String date = item.child(2).text().replaceAll("/", "-").replaceAll("\\[|]", "");

        if (!title.contains("党")) {
            Map<String, String> entry = new HashMap<>();
            entry.put("title", title);
            entry.put("link", link);
            entry.put("date", date);
            list.add(entry);
        }
    }

    return ret;
}

// 版本四
public void v4_pull() {
String path = this.dir + "/" + news.getSource();
Expand Down Expand Up @@ -410,20 +360,22 @@ private Map<String, Object> v4_list(String tag, String page) {
ret.put("next", null);
}

News.PatternType pattern = news.getPattern();
News.PatternType.PatternPos pos = pattern.getPos();
// 解析列表
compile = Pattern.compile(news.getPattern());
compile = Pattern.compile(pattern.getRule());
matcher = compile.matcher(body);

Map<String, String> jo;
while (matcher.find()) {
// 真实路径处理
String link = link_pre + matcher.group(2);
String link = link_pre + matcher.group(pos.getLink());
link = new URL(link).getPath();
link = getRealPath(link);

jo = new HashMap<>();
jo.put("date", matcher.group(1).replaceAll("/", "-").replaceAll("\\[|]", ""));
jo.put("title", matcher.group(3));
jo.put("date", matcher.group(pos.getDate()).replaceAll("/", "-").replaceAll("\\[|]", ""));
jo.put("title", matcher.group(pos.getTitle()));
jo.put("link", link);
if (!matcher.group(3).contains("党"))
list.add(jo);
Expand All @@ -450,4 +402,58 @@ private String getRealPath(String filename) {
return filename;
}

// // 版本三
// @Deprecated
// public void v3_pull() {
// String path = this.dir + "/" + news.getSource();
// File folder = new File(path);
//
// if (!folder.exists() && !folder.isDirectory()) {
// System.out.println(folder.mkdirs());
// }
//
// news.getTags().forEach(o -> {
// String name = o.get("name");
// try {
// Map<String, Object> v3_list = v3_list(name);
// v3_list.put("name", news.getName());
// FileUtil.WriteFile(path + "/" + name + "_1.json", JsonUtil.obj2String(v3_list));
// } catch (IOException | ParseException e) {
// e.printStackTrace();
// }
// o.put("total", "1");
// });
//
// FileUtil.WriteFile(path + "/tags.json", JsonUtil.obj2String(news.getTags()));
// }
// @Deprecated
// private Map<String, Object> v3_list(String tag) throws IOException, ParseException {
// String html = HttpUtil.doGet("https://www.cuit.edu.cn/NewsList?id=" + tag);
// html = html.replaceAll("\r\n", "").replaceAll("\n", "").replaceAll("\r", "");
//
// List<Map<String, String>> list = new LinkedList<>();
// Map<String, Object> ret = new HashMap<String, Object>() {{
// put("domain", "www.cuit.edu.cn");
// put("list", list);
// }};
//
// JXDocument jxDocument = JXDocument.create(html);
// List<JXNode> jxNodes = jxDocument.selN("//*[@id=\"NewsListContent\"]/li");
// jxNodes.forEach(e -> {
// Element element = e.asElement();
// String title = element.child(1).text();
// String link = element.child(1).attr("href");
// String date = element.child(2).text().replaceAll("/", "-").replaceAll("\\[|]", "");
//
// if (!title.contains("党"))
// list.add(new HashMap<String, String>() {{
// put("title", title);
// put("link", link);
// put("date", date);
// }});
// });
//
// return ret;
// }

}

1 comment on commit e2e874f

@msojocs
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

refactor: 资讯处理格式

fix: 部分学院资讯为空
#6

Please sign in to comment.