解决Out Of Memory问题实战-51CTO.COM

最近用solr进行了一个做索引的测试，在长时间运行做索引的程序之后，会出现堆内存溢出的错误。本文Po出简单代码，并对该问题进行分析和解决。

solr版本为5.5.0，使用三台服务器配置solr集群，solr以cloud方式启动，使用自己配置的zookeeper。在solr上新建一个数据集，并分为3片，每片配置两个replica，交叉备份。

要做索引的数据量是2600+万，存储在MySql数据库表中，数据一直在更新。一次从数据库表中查询5000条数据。solr搜索主要针对标题和内容，因此需要将表中的标题和内容做到solr中。其中内容占用空间非常大，在数据库中使用mediumtext进行存储。

数据集的配置如下：

<!-- Document id: indexed, stored and mandatory for every document. -->
<field name="id" type="string" indexed="true" stored="true" required="true" />  
<!-- Title: searchable and returned with results. text_ik is presumably an IK-analyzer field type declared elsewhere in the schema; confirm. -->
<field name="title" type="text_ik" indexed="true" stored="true" /> 
<!-- URL: stored for display only, not searchable. -->
<field name="url" type="string" indexed="false" stored="true" /> 
<!-- Insert time of the row, kept as a plain string. -->
<field name="intime" type="string" indexed="true" stored="true"/> 
<!-- Body text: searchable, but not stored, so it cannot be returned from Solr. -->
<field name="content" type="text_ik" indexed="true" stored="false"/> 
<!-- Catch-all search field that receives copies of title and content (see the copyField rules below). -->
<field name="allcontent" type="text_ik" indexed="true" stored="false" multiValued="true"/> 
<!-- Copy both source fields into allcontent; it is multiValued because it receives two values per document. -->
<copyField source="title" dest="allcontent" />       
<copyField source="content" dest="allcontent" />

搜索模式分为标题检索和全文检索，因此配置了allcontent复合字段，将标题和内容都放到这里。

做索引的程序使用Java实现，具体思路如下：

由于数据一直在更新，因此使用while(true)循环进行处理，一次循环查询5000条数据;
数据量很大，如果程序出现异常停止运行，要保证下次重新启动时从上次停的“点”继续做索引，因此要将这个“点”存储在文件中，防止丢失，本程序使用数据插入时间作为这个“点”;
一次查询5000条数据做处理，统一插入到solr中。

介绍了这么多，终于把前提说完了，下面上类图和具体代码，说明问题。

public abstract class SolrAbstract{ 
  
    public static final Logger log = Logger.getLogger(SolrAbstract.class); 
      
    public HttpSolrClient server; 
    public List data; // 数据库中需要处理的数据 
    public Collection docs = new CopyOnWriteArrayList(); 
      
    public  SolrAbstract(HttpSolrClient server) throws IOException, SolrServerException { 
        log.info("开始做索引");   
        if(server==null) 
            throw new SolrServerException("server不能为空"); 
        this.server = new HttpSolrClient(getUrl()); 
    } 
      
    public SolrAbstract()throws SolrServerException,IOException{ 
        log.info("开始做索引"); 
        this.server = new HttpSolrClient(getUrl()); 
    } 
  
    public SolrAbstract(List data) throws IOException, SolrServerException { 
        if(data == null || data.isEmpty()) { 
            try { 
                throw new InvalidParameterException("List不能为空"); 
            } catch (InvalidParameterException e) { 
                e.printStackTrace(); 
            } 
        } 
        this.data = data; 
    } 
  
    public String getUrl() { 
        return "http://192.168.20.10:8983/solr/test/"; // test为数据集名称 
    } 
} 
  
public class DoIndex extends SolrAbstract { 
      
    public DoIndex(String url) throws SolrServerException, IOException { 
        super(); 
    } 
      
    public void process() throws Exception { 
        for (int i = 0; i < this.data.size(); i++) { 
            Product p = (Product) this.data.get(i); 
            SolrInputDocument doc = new SolrInputDocument(); 
            doc.addField("id", p.getId()); 
            doc.addField("title", p.getTitle()); 
            doc.addField("url", p.getUrl()); 
            doc.addField("intime", p.getIntime()); 
            doc.addField("content", p.getContent()); 
            doc.addField("content", p.getContent()); 
            docs.add(doc); 
        } 
    } 
  
    public synchronized void commitIndex() throws IOException, SolrServerException { 
        long start = System.currentTimeMillis(); 
        if (docs.size() > 0) { 
            server.add(docs); 
        }                
        server.commit(); 
        long endTime = System.currentTimeMillis(); 
        log.info("提交索引花费时间："+((endTime - start))); 
        docs.clear(); 
        log.info("结束做索引"); 
    } 
} 
  
public class ProcessData { 
      
    DoIndex index ; 
    private JdbcUtil jdbc; 
    private static String RECORD_INTIME ; 
      
    public ProcessData(JdbcUtil jdbc){ 
        try { 
            RECORD_INTIME = "/home/solr/recordIntime.txt"; 
            this.jdbc = jdbc; 
            index = new DoIndex(); 
        } catch (Exception e) { 
            e.printStackTrace(); 
        } 
    } 
  
    public void processData() throws Exception{ 
        int startTime = Integer.parseInt(FileUtils.readFiles(RECORD_INTIME)); // ***startTime=0，从文件中读取记录时间 
        String sql = "select id,title,content,url,intime from testTable where intime>startTime limit 5000; 
        List<HashMap> list = jdbc.queryList(sql); 
        while(list!=null&&list.size()>0){ 
            index.data = new ArrayList<Product>(); 
            for (int i = 0; i < list.size(); i++) { 
                Map<String,Object> item =  list.get(i); 
                Product p = new Product(); 
                p.setId(item.get("id").toString()); 
                p.setTitle(item.get("title").toString()); 
                p.setUrl(item.get("url").toString()); 
                p.setIntime(item.get("intime").toString()); 
                p.setContent(item.get("content").toString()); 
                index.data.add(p); 
                startTime = (int)item.get("intime"); 
            }        
            index.process(); // 组装索引数据 
            index.commitIndex(); // 提交索引 
            index.data.clear(); 
            list.clear(); 
            FileUtils.writeFiles(startTime, RECORD_INTIME); // 将***的时间写入到文件中 
        } 
    } 
}

上述代码在小数据量短时间内测试没有问题，但运行几个小时之后报错堆内存溢出。

检查程序，发现SolrAbstract类中定义了两个成员变量data和docs，这两个都是“大对象”，虽然在程序中都进行了clear()，但还是怀疑JVM并没有及时清理这两个对象引用的对象。还有processData()方法中将从数据库查询的数据存入list中，这样可能也会导致内存不会被及时回收。

抱着试试看的态度对程序进行了修改。修改后的程序如下：

public class ProcessData { 
      
    private JdbcUtil jdbc; 
    private static String RECORD_INTIME ; 
    public ProcessData(JdbcUtil jdbc){ 
        try { 
            RECORD_INTIME = "/home/solr/recordIntime.txt"; 
            this.jdbc = jdbc; 
        } catch (Exception e) { 
            e.printStackTrace(); 
        } 
    } 
  
    public void processData() throws Exception{ 
        int startTime = Integer.parseInt(FileUtils.readFiles(RECORD_INTIME)); // ***startTime=0，从文件中读取记录时间 
        String sql = "select id,title,content,url,intime from testTable where intime>startTime limit 5000; 
        ResultSet rs = null; 
        try{ 
            rs = jdbc.query(sql); // 直接使用ResultSet获取数据结果，不再将结果存入list中 
            List list = new ArrayList(); 
            while(rs!=null&&rs.next()){ 
                SolrInputDocument doc = new SolrInputDocument(); 
                doc.addField("id", rs.getInt("id")); 
                doc.addField("title",rs.getString("title")); 
                doc.addField("url",rs.getString("url")); 
                doc.addField("intime",rs.getInt("intime")); 
                doc.addField("content", rs.getString("content")); 
                list.add(doc); 
            } 
            commitData(list); 
            list.clear(); 
            list.removeAll(list); 
            list = null; 
              
        }catch(Exception e) { 
            e.printStackTrace(); 
        }finally { 
            try{ 
                if(rs!=null) { 
                    rs.close(); 
                    rs = null; 
                } 
            }catch(Exception e) { 
                e.prepareStatement(); 
            } 
        } 
    } 
      
    public void commitData(Collection docs) { 
        try { 
            long start = System.currentTimeMillis(); 
            if (docs.size() > 0) { 
                server.add(docs); 
            } 
            log.info("当前占用内存: " + (Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory())); 
            server.commit(); 
            long endTime = System.currentTimeMillis(); 
            log.info("提交索引时间："+((endTime - start))); 
            docs.clear(); 
            docs = null; 
            log.info("提交索引结束"); 
        } catch (SolrServerException e) { 
            e.printStackTrace(); 
        } catch (IOException e) { 
            e.printStackTrace(); 
        } 
    } 
}

代码进行上述修改后，运行了几个小时，不再报堆内存溢出的错误了。

现在假设业务需求修改了，要求在查询5000条数据时，对每条数据进行处理：需要根据id去其他表中查询修改的标题并写入索引中。

我在上述代码中直接进行了修改，在while(rs!=null&&rs.next())循环中加入了查询另外一张表的代码。运行程序发现当前占用的内存越来越多。于是我在服务器上使用了jstat查询当前虚拟机内存占用情况，命令如下：

jstat -gcutil pid 10000

10秒输出一次内存占用及垃圾回收情况，发现Young GC和Full GC非常频繁，并且Full GC之后，老年代内存回收情况并不好，监控如下：

这里可以看到第四列老年代刚开始只占用了28.64%，运行一段时间后内存占用量达到81.22%，进行Full GC之后，仍然占用52.87%。

检查代码，发现是在while(rs!=null&&rs.next())里查询另外一张表的代码出现的问题。开发匆忙，我从网上随便找了一个数据库工具类进行的开发，发现里面的query方法是这样的：

public ResultSet query(String sql){ 
    ResultSet rs = null; 
    PreparedStatement ps = null; 
    try { 
        ps = conn.prepareStatement(sql); 
        rs = ps.executeQuery(); 
    } catch (SQLException e) { 
        e.printStackTrace(); 
    } 
    return rs; 
}

这段程序并没有及时释放ps，因为查询频繁，ps引用的对象一直得不到回收，导致这些对象进入了老年代，并且虚拟机检查这些对象仍然与GC Root有关联，因此导致老年代垃圾回收效果不好。也是这个原因导致的Young GC和Full GC非常频繁。

大致找到了问题原因，修改代码如下：

public void processData() throws Exception{ 
    int startTime = Integer.parseInt(FileUtils.readFiles(RECORD_INTIME)); // ***startTime=0，从文件中读取记录时间 
    String sql = "select id,title,content,url,intime from testTable where intime>startTime limit 5000; 
    ResultSet rs = null; 
    try{ 
        rs = jdbc.query(sql); // 直接使用ResultSet获取数据结果，不再将结果存入list中 
        List list = new ArrayList(); 
        while(rs!=null&&rs.next()){ 
            SolrInputDocument doc = new SolrInputDocument(); 
            doc.addField("id", rs.getInt("id")); 
            doc.addField("title",rs.getString("title")); 
            doc.addField("url",rs.getString("url")); 
            doc.addField("intime",rs.getInt("intime")); 
            doc.addField("content", rs.getString("content")); 
            PreparedStatement ps1 = jdbc.getConn().prepareStatement("select newtitle from testTable2 where id=?"); 
            ps1.setInt(1, rs.getInt("id")); 
            ResultSet rs1 = ps1.executeQuery(); 
            String newtitle = ""; 
            while(rs1!=null&&rs1.next()) { 
                newtitle = rs1.getString("newtitle"); 
            } 
            if(rs1!=null) { 
                rs1.close(); 
                rs1 = null; 
            } 
            if(ps1!=null) { 
                ps1.close(); 
                ps1 = null; 
            } 
            doc.addField("newtitle",newtitle); // 当然solr数据集的配置文件也需要修改，这里不再赘述 
            list.add(doc); 
        } 
        commitData(list); 
        list.clear(); 
        list.removeAll(list); 
        list = null; 
          
    }catch(Exception e) { 
        e.printStackTrace(); 
    }finally { 
        try{ 
            if(rs!=null) { 
                rs.close(); 
                rs = null; 
            } 
        }catch(Exception e) { 
            e.prepareStatement(); 
        } 
    } 
}

经过上面的修改，再次运行程序，不再发生内存溢出了，用jstat监控如下：

可以看到Young GC和Full GC正常了。Full GC在开始阶段基本没有被触发，Young GC也少了很多。而第四列的老年代回收情况也变的正常了。

上面的例子很简单，导致堆内存溢出的问题也比较常见。我想说的是看完一本书可能能被记住的内容并不多，但随着经验的积累和实践的增多，你会慢慢有一种感觉，能够大致定位到问题在哪里，这样就够了。

参考：《深入理解Java虚拟机：JVM高级特性与最佳实践(第2版)》

【本文为51CTO专栏作者“王森丰”的原创稿件，转载请注明出处】