Python Python 开发

怎么使用python取出一个目录下面所有文件的指定内容

0

需求：
1.在指定目录下约有500个Java文件；
2.需要提取出每个文件中特定的内容,需要去重；
3.最终生成文本文件或excel文件；
4.需要提取的内容是每个方法中包含 collection = MongoUtil. 的语句；
5.提取出的语句需要与其所在的方法一一对应（即按方法分组输出）；
6.500个文件的内容最终输出到一个文件内；

例:
下面这个文件(IndexInfoService.java)需要提取的数据是这个样子的：
IndexInfoService.java
getIndexBasicInfo（方法1）
DBCollection collection = MongoUtil.getGGf10dbCollection("t_f10_index_basicinfo")
collection = MongoUtil.getGGStockBaseCollection("gg_cihdquote")
collection = MongoUtil.getGGf10dbCollection("t_f10_index_industry_code")
-----------------------------------------------------------------------------------------
getIndustryStockCodes（方法2）
DBCollection collection = MongoUtil.getGGStockBaseCollection("stock_sw_industry")
-----------------------------------------------------------------------------------------
getIndexMarketNews（方法3）
DBCollection collection = MongoUtil.getGGf10dbCollection("t_f10_index_industry_code")
-----------------------------------------------------------------------------------------
getConsensusExpecData（方法4）
DBCollection collection = MongoUtil.getGGf10dbCollection("t_f10_index_industry_code")
collection = MongoUtil.getGGIndustryCollection("t_hy_consensus")

文件IndexInfoService.java:
package f10service.v1.index.service;

import ggframework.bottom.store.mongodb.AggregationOutput;
import ggframework.bottom.store.mongodb.BasicDBObject;
import ggframework.bottom.store.mongodb.DBCollection;
import ggframework.bottom.store.mongodb.DBCursor;
import ggframework.bottom.store.mongodb.DBObject;
import ggframework.bottom.store.mongodb.GGDBCursor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.elasticsearch.common.collect.ImmutableMap;
import org.elasticsearch.common.collect.Maps;
import org.h2.expression.Aggregate;

import com.google.common.collect.Lists;

import ggf10service.common.DateUtil;
import ggf10service.common.HtmlUtils;
import ggf10service.common.MongoUtil;

public class IndexInfoService {
   /**
   * Assembles the overview data for one index.
   *
   * The returned map carries three entries:
   * "index_basic" (the profile document as a map, or an empty map when no
   * profile document matches), "index_intensity" (close prices keyed by trade
   * date for symbol "000300" and for the requested index) and
   * "index_industry" (the industry breakdown, each row extended with a
   * "stock_code_count" value).
   *
   * @param indexCode index symbol used as the "symbol" filter in every query
   * @return map holding the three sections described above
   * @date 2017-09-11 13:42:19
   */
   public static Map<String, Object> getIndexBasicInfo(String indexCode){
       Map<String, Object> result = Maps.newHashMap();
       DBCollection collection = MongoUtil.getGGf10dbCollection("t_f10_index_basicinfo");
       // Query conditions
       DBObject query = new BasicDBObject();
       query.put("status", 1);
       query.put("symbol", indexCode);

       // Projected fields
       DBObject fields = new BasicDBObject();
       fields.put("index_name", 1);// index name
       fields.put("indexsname", 1);// index short name
       fields.put("symbol", 1);// index code
       fields.put("index_ename", 1);// index English name
       fields.put("issuename", 1);// publication method
       fields.put("publishdate", 1);// publication date
       fields.put("cur", 1);// currency
       fields.put("benchdate", 1);// base date
       fields.put("benchnum", 1);// base points
       fields.put("consecurities", 1);// number of securities covered
       fields.put("chgperiod", 1);// adjustment period
       fields.put("index_type", 1);// index category
       fields.put("estclass", 1);// index compilation method
       fields.put("weimode", 1);// index weighting method
       fields.put("mcap", 1);// free-float market value
       fields.put("mcapital", 1);// free-float share capital
       fields.put("_id", 0);

       DBObject obj = collection.findOne(query, fields);
       if (obj != null) {
           // Both figures are scaled down by 1e8 before being returned.
           obj.put("mcap", obj.getDouble("mcap")/100000000);// free-float market value
           obj.put("mcapital", obj.getDouble("mcapital")/100000000);// free-float share capital
           result.put("index_basic", obj.toMap());
       } else {
           // No matching profile: return an empty section instead of
           // dereferencing null (the original threw NullPointerException here).
           result.put("index_basic", new HashMap<String, Object>());
       }

       /*
        * Index strength: close prices of the requested index and of "000300"
        */
       collection = MongoUtil.getGGStockBaseCollection("gg_cihdquote");

       query = new BasicDBObject();
       query.append("symbol", new BasicDBObject("$in", Arrays.asList("000300", indexCode)));
       // Lower bound on tdate: the date 12 months before now, as a yyyyMMdd integer.
       query.append("tdate", new BasicDBObject("$gte", DateUtil.dateToInteger(DateUtil.getDateBeforeMonths(new Date(), 12), "yyyyMMdd")));

       fields = new BasicDBObject();
       fields.put("tdate", 1);// trade date
       fields.put("symbol", 1);// index code
       fields.put("tclose", 1);// closing price

       BasicDBObject sort = new BasicDBObject();
       sort.append("tdate", 1);

       Map<Date, Double> maphs = Maps.newLinkedHashMap();
       Map<Date, Double> mapindex = Maps.newLinkedHashMap();
       DBCursor quoteCursor = collection.find(query, fields).sort(sort);
       try {
           while (quoteCursor.hasNext()) {
               DBObject o = quoteCursor.next();
               Date tradeDate = DateUtil.integerToDate(o.getInteger("tdate"));
               // Constant-first equals keeps the comparison null-safe.
               if ("000300".equals(o.getString("symbol"))) {
                   maphs.put(tradeDate, o.getDouble("tclose"));
               } else {
                   mapindex.put(tradeDate, o.getDouble("tclose"));
               }
           }
       } finally {
           // Release the cursor even when iteration fails.
           quoteCursor.close();
       }

       Map<String, Object> mapIntensity = Maps.newLinkedHashMap();
       mapIntensity.put("hs300", maphs);// series for symbol "000300"
       mapIntensity.put("currentIndex", mapindex);// series for the requested index
       result.put("index_intensity", mapIntensity);

       /*
        * Constituent industries
        */
       collection = MongoUtil.getGGf10dbCollection("t_f10_index_industry_code");

       query = new BasicDBObject();
       query.append("status", 1);
       query.append("symbol", indexCode);

       fields = new BasicDBObject();
       fields.put("industrycode", 1);// industry code of the constituents
       fields.put("industryname", 1);// industry name of the constituents
       fields.put("indstock_zb", 1);// share of constituents in the industry
       fields.put("_id", 0);

       List<Map<String, Object>> list_industry = Lists.newArrayList();
       List<String> industrys = Lists.newArrayList();
       DBCursor industryCursor = collection.find(query, fields);
       try {
           while (industryCursor.hasNext()) {
               DBObject o = industryCursor.next();
               industrys.add(o.getString("industrycode"));
               list_industry.add(o.toMap());
           }
       } finally {
           industryCursor.close();
       }

       // Attach the per-industry stock count; industries without a count get null.
       Map<String, Integer> map = getIndustryStockCodes(industrys, indexCode);
       for(Map<String, Object> indus: list_industry){
           String industrycode = indus.get("industrycode").toString();
           indus.put("stock_code_count", map.get(industrycode));
       }
       result.put("index_industry", list_industry);
       return result;
   }
   /**
   * Counts, per industry code, the stocks of an index that fall into it.
   *
   * Runs a two-stage aggregation on "stock_sw_industry": a $match on
   * sw_second_code (in the given industry codes) and stock_code (in the
   * codes returned by IndexCapitalService.getStockByIndex), followed by a
   * $group on sw_second_code that sums 1 per document.
   *
   * @param industrys industry codes to match against sw_second_code
   * @param indexCode index code whose stock list restricts the match
   * @return map from sw_second_code to the number of matched documents
   * @date 2017-09-22 15:29:55
   */
   public static Map<String, Integer> getIndustryStockCodes(List<String> industrys, String indexCode){
       Map<String, Integer> countsByIndustry = Maps.newHashMap();

       List<String> constituentCodes = IndexCapitalService.getStockByIndex(indexCode);
       DBCollection swIndustry = MongoUtil.getGGStockBaseCollection("stock_sw_industry");

       // Stage 1: keep only rows of the requested industries and index stocks.
       BasicDBObject matchStage = new BasicDBObject("$match",
               new BasicDBObject("sw_second_code", new BasicDBObject("$in", industrys.toArray()))
                       .append("stock_code", new BasicDBObject("$in", constituentCodes.toArray())));

       // Stage 2: one output row per industry code with its row count.
       BasicDBObject groupStage = new BasicDBObject("$group",
               new BasicDBObject("_id", "$sw_second_code")
                       .append("count", new BasicDBObject("$sum", 1)));

       AggregationOutput aggregated = swIndustry.aggregate(matchStage, groupStage);
       Iterator<DBObject> rows = aggregated.results().iterator();
       while (rows.hasNext()) {
           DBObject row = rows.next();
           countsByIndustry.put(row.getString("_id"), row.getInteger("count"));
       }
       return countsByIndustry;
   }
   /**
   * Collects the market-news sections for one index.
   *
   * The returned map carries "news_report" (every industry row of the index,
   * each extended with "report_rate" = reportcnt / total reportcnt, sorted by
   * that rate in descending order), "news_industry" (result of
   * getIndustryNews for the industryname1 values of the five highest-rated
   * rows) and "news_policy" (result of getPolicyNew).
   *
   * @param indexCode index code
   * @return map holding the three sections described above
   * @date 2017-09-11 17:41:13
   */
   public static Map<String, Object> getIndexMarketNews(String indexCode){

       Map<String, Object> map = Maps.newHashMap();

       DBCollection collection = MongoUtil.getGGf10dbCollection("t_f10_index_industry_code");

       BasicDBObject query = new BasicDBObject();
       query.append("status", 1);
       query.append("symbol", indexCode);

       BasicDBObject fields = new BasicDBObject();
       fields.put("industrycode", 1);// industry code of the constituents
       fields.put("industryname", 1);// industry name of the constituents
       fields.put("reportcnt", 1);// monthly report count of the industry
       fields.put("industryname1", 1);// first-level industry name
       fields.put("_id", 0);

       List<Map<String, Object>> list_report = Lists.newArrayList();
       double reportNum = 0;// total report count across all rows
       DBCursor cursor = collection.find(query, fields);
       try {
           while (cursor.hasNext()) {
               DBObject o = cursor.next();
               reportNum += o.getInteger("reportcnt");
               list_report.add(o.toMap());
           }
       } finally {
           // Release the cursor even when reading a row fails.
           cursor.close();
       }

       for(Map<String, Object> report:list_report) {
           if(reportNum != 0) {
               report.put("report_rate", Double.parseDouble(report.get("reportcnt").toString())/reportNum);
           } else {
               report.put("report_rate", 0);
           }
       }

       map.put("news_report", list_report);

       // Sorting in place also orders the list already stored under "news_report".
       Collections.sort(list_report, new Comparator<Map<String, Object>>() {

           @Override
           public int compare(Map<String, Object> o1, Map<String, Object> o2) {
               double rate1 = Double.parseDouble(o1.get("report_rate").toString());
               double rate2 = Double.parseDouble(o2.get("report_rate").toString());
               // Descending order: higher rate first.
               return Double.compare(rate2, rate1);
           }
       });

       // Only the five highest-rated rows drive the industry-news lookup.
       List<Map<String, Object>> topReports = list_report;
       if(topReports.size() > 5) {
           topReports = topReports.subList(0, 5);
       }
       List<String> industrys = Lists.newArrayList();
       for(Map<String, Object> report:topReports) {
           Object industryName = report.get("industryname1");
           // Rows without a first-level industry name are skipped rather than
           // failing the whole request with a NullPointerException.
           if (industryName != null) {
               industrys.add(industryName.toString());
           }
       }
       DBCollection urlcontents = MongoUtil.getGGStockCollection("urlcontents");

       // Industry news
       map.put("news_industry", getIndustryNews(urlcontents, industrys));
       // Policy updates
       map.put("news_policy", getPolicyNew(urlcontents));
       return map;
   }
   /**
   * Fetches the most recent news item for each given industry.
   *
   * For every industry name one document is read from the collection:
   * ir_groupname equal to the industry, ir_urlcontent neither null nor "",
   * newest ir_urltime first, limit 1. Items whose ir_hkey was already seen
   * for an earlier industry are dropped.
   *
   * @param urlcontents collection to query
   * @param industrys industry names matched against ir_groupname
   * @return one map per distinct news item, in the order of the input names
   * @date 2017-09-12 13:39:03
   */
   public static List<Map<String, Object>> getIndustryNews(DBCollection urlcontents, List<String> industrys){
       List<Map<String, Object>> list = Lists.newArrayList();
       // ir_hkey values already emitted, used to suppress duplicates.
       List<String> key = Lists.newArrayList();
       for(String industry:industrys){
           DBCursor cursor = urlcontents.find(new BasicDBObject("ir_groupname", industry)
                   .append("ir_urlcontent", new BasicDBObject("$nin", Arrays.asList(null, ""))),
                   new BasicDBObject("ir_groupname", 1)
                   .append("ir_urlcontent", 1)
                   .append("ir_urltitle", 1)
                   .append("ir_srcname", 1)
                   .append("ir_urltime", 1)
                   .append("ir_hkey", 1)
                   .append("_id", 0)
                   ).sort(new BasicDBObject("ir_urltime", -1)).limit(1);
           try {
               if(cursor.hasNext()){
                   DBObject obj = cursor.next();
                   String ir_hkey = obj.getString("ir_hkey");
                   if(!key.contains(ir_hkey)) {
                       key.add(ir_hkey);
                       list.add(obj.toMap());
                   }
               }
           } finally {
               // Close this industry's cursor even if reading the row fails,
               // so a failure does not leak a cursor per loop iteration.
               cursor.close();
           }
       }

       return list;
   }
   /**
   * Fetches the latest policy-update news items.
   *
   * Reads up to five documents whose ir_groupname is "政策动态", newest
   * ir_urltime first. Documents with a null or blank ir_urlcontent are
   * skipped, so fewer than five items may be returned. For each remaining
   * document the content has "\r\n" removed, is passed through
   * HtmlUtils.trimHtml and is cut to at most 200 characters.
   *
   * @param urlcontents collection to query
   * @return list of maps with the keys summary, title, source, date and id
   * @date 2017-09-12 10:59:00
   */
   public static List<Map<String, Object>> getPolicyNew(DBCollection urlcontents) {

       BasicDBObject urlQuery = new BasicDBObject();
       urlQuery.append("ir_groupname", "政策动态");

       BasicDBObject fields = new BasicDBObject();
       fields.append("_id", 0);
       fields.append("ir_urlcontent", 1);// news content
       fields.append("ir_urltitle", 1);// title
       fields.append("ir_srcname", 1);// source
       fields.append("ir_urltime", 1);// time
       fields.append("ir_hkey", 1);// primary key

       List<Map<String, Object>> result = new ArrayList<Map<String, Object>>();
       DBCursor urlCursor = urlcontents.find(urlQuery, fields).sort(new BasicDBObject("ir_urltime", -1)).limit(5);
       try {
           while(urlCursor.hasNext()){
               DBObject o = urlCursor.next();

               // Read the content once; skip items that have nothing to summarize.
               Object content = o.get("ir_urlcontent");
               if(content == null || StringUtils.isBlank(content.toString())){
                   continue;
               }

               // Literal replacement: no regular expression is needed here.
               String summary = content.toString().replace("\r\n", "");
               summary = HtmlUtils.trimHtml(summary);
               if(summary.length() > 200){
                   summary = summary.substring(0, 200);
               }

               Map<String, Object> map = new HashMap<String, Object>();
               map.put("summary", summary);// summary
               map.put("title", o.get("ir_urltitle"));
               map.put("source", o.get("ir_srcname"));
               map.put("date", o.get("ir_urltime"));
               map.put("id", o.get("ir_hkey"));
               result.add(map);
           }
       } finally {
           // Release the cursor even when building a summary fails.
           urlCursor.close();
       }

       return result;
   }
   /**
   * Collects consensus-expectation data for the industries of one index.
   *
   * Reads the industry rows of the index from "t_f10_index_industry_code",
   * keeps at most the first five, and for each of them stores the result of
   * getExpected (queried on "t_hy_consensus" by industrycode) under the
   * row's industryname. Rows sharing an industryname overwrite each other.
   *
   * @param indexCode index code
   * @return map from industry name to its list of yearly expectation rows
   * @date 2017-09-12 17:34:10
   */
   public static Map<String, Object> getConsensusExpecData(String indexCode){

       Map<String, Object> map = Maps.newHashMap();
       DBCollection collection = MongoUtil.getGGf10dbCollection("t_f10_index_industry_code");

       BasicDBObject query = new BasicDBObject();
       query.append("status", 1);
       query.append("symbol", indexCode);

       BasicDBObject fields = new BasicDBObject();
       fields.put("industrycode1", 1);// first-level industry code
       fields.put("industryname1", 1);// first-level industry name
       fields.put("industryname", 1);// second-level industry name
       fields.put("industrycode", 1);// second-level industry code
       fields.put("_id", 0);

       List<Map<String, Object>> list_report = Lists.newArrayList();
       DBCursor cursor = collection.find(query, fields);
       try {
           while(cursor.hasNext()) {
               DBObject o = cursor.next();
               list_report.add(o.toMap());
           }
       } finally {
           // Release the cursor even when reading a row fails.
           cursor.close();
       }

       // Only the first five industry rows are looked up.
       if(list_report.size() > 5) {
           list_report = list_report.subList(0, 5);
       }
       collection = MongoUtil.getGGIndustryCollection("t_hy_consensus");
       for(Map<String, Object> report:list_report) {
           map.put(report.get("industryname").toString(), getExpected(collection, report.get("industrycode").toString()));
       }
       return map;
   }
   /**
   * Loads the consensus-expectation rows of one industry, one row per year.
   *
   * The years come from IndustryUtil.getDmYear(new Date()), parsed to
   * integers and sorted ascending. Rows are read from the given collection
   * by industrycode. For each row income_tb is multiplied by 100 when
   * present, and profit and forecast_income are divided by 1e8 when present.
   * The result holds one entry per year: the stored row for that year, or a
   * placeholder whose metrics are all null.
   *
   * @param collection collection to query
   * @param industry value matched against industrycode
   * @return rows ordered by year, or null when the query yields no rows
   * @date 2017-09-12 17:40:23
   */
   public static List<Map<String, Object>> getExpected(DBCollection collection, String industry){
       // Years to report on, as supplied by IndustryUtil for the current date.
       Map<String, Object> yearSource = IndustryUtil.getDmYear(new Date());
       List<Integer> years = new ArrayList<Integer>();
       for (Object rawYear : yearSource.values()) {
           years.add(Integer.parseInt(rawYear.toString()));
       }
       Collections.sort(years);

       BasicDBObject criteria = new BasicDBObject();
       criteria.put("industrycode", industry);

       DBObject projection = new BasicDBObject();
       projection.put("_id", 0);
       projection.put("eps", 1);// consensus EPS
       projection.put("eps_tb", 1);// consensus EPS, year over year
       projection.put("profit", 1);// consensus net profit
       projection.put("profit_tb", 1);// consensus net profit, year over year
       projection.put("pe", 1);// consensus PE
       projection.put("pb", 1);// consensus PB
       projection.put("forecast_income", 1);// consensus operating revenue
       projection.put("income_tb", 1);// consensus operating revenue, year over year
       projection.put("ps", 1);// consensus PS
       projection.put("time_year", 1);// forecast year

       DBCursor cursor = collection.find(criteria, projection);
       List<Map<String, Object>> rows = GGDBCursor.find(cursor, ImmutableMap.<String, String>of("profit_tb", "profittb"), 0, 0);

       // No data at all: callers receive null, not an empty list.
       if (rows == null || rows.isEmpty()) {
           return null;
       }

       // Index the stored rows by their forecast year after rescaling.
       Map<String, Map<String, Object>> rowsByYear = new HashMap<String, Map<String, Object>>();
       for (Map<String, Object> row : rows) {
           Object incomeGrowth = row.get("income_tb");
           if (incomeGrowth != null) {
               row.put("income_tb", Double.parseDouble(incomeGrowth.toString()) * 100);
           }

           Object profit = row.get("profit");
           if (profit == null) {
               row.put("profit", null);
           } else {
               row.put("profit", Double.valueOf(profit.toString())/100000000);// consensus net profit
           }

           Object income = row.get("forecast_income");
           if (income == null) {
               row.put("forecast_income", null);
           } else {
               row.put("forecast_income", Double.valueOf(income.toString())/100000000);// consensus operating revenue
           }

           rowsByYear.put(row.get("time_year").toString(), row);
       }

       // Emit one row per year, filling the gaps with all-null placeholders.
       List<Map<String, Object>> yearlyRows = new ArrayList<Map<String,Object>>();
       for (Integer year : years) {
           Map<String, Object> stored = rowsByYear.get(year.toString());
           if (stored != null) {
               yearlyRows.add(stored);
               continue;
           }
           Map<String, Object> placeholder = new LinkedHashMap<String, Object>();
           placeholder.put("eps", null);
           placeholder.put("eps_tb", null);
           placeholder.put("profit", null);
           placeholder.put("profit_tb", null);
           placeholder.put("pe", null);
           placeholder.put("pb", null);
           placeholder.put("forecast_income", null);
           placeholder.put("income_tb", null);
           placeholder.put("ps", null);
           placeholder.put("time_year", year);
           yearlyRows.add(placeholder);
       }
       return yearlyRows;
   }
}

重要提示：提问者不能发表回复，可以通过评论与回答者沟通，沟通后可以通过编辑功能完善问题描述，以便后续其他人能够更容易理解问题。

课程推荐：实战秘技，经典案例，一门课程锤炼数据分析师八大能力！

1 个回复

1

seng - 从事BI、大数据、数据分析工作 2018-06-09 回答

要回复问题请先登录或注册

怎么使用python取出一个目录下面所有文件的指定内容

1 个回复

发起人

相关问题

问题状态

怎么使用python取出一个目录下面所有文件的指定内容

与内容相关的链接

1 个回复

发起人

相关问题

问题状态