Solr: manage time-based collections

If you use Solr as your full-text search engine, you may be frustrated to miss Curator, the excellent tool from Elastic that lets you manage your indices. Cloudera offers an admin tool for Solr named solrctl, a light utility to supervise a SolrCloud deployment. Although solrctl has some useful commands, it does not let you delete old time-based collections. Time-based collections, and more generally sharding/partitioning per time frame, are a common pattern for aggregation but also for many other use cases. The idea is simple: the collections have names containing a date in a specific format (…-MM-DD-YYYY, …MM.DD.YYYY, etc.). Compared with having one big collection holding all the documents, this approach is very efficient for many reasons:

  • You don’t have to delete documents in one massive collection via complex delete query
  • You don’t have to reshard your collections if they grow too large
  • Reindexing the data will be less expensive if you work on many collections

(more on this in the Elasticsearch documentation)

If you want to delete the old collections and keep only those that are no more than 30 days old, this post presents a simple tool to do it. It is far from being as complete as Curator, but it does the job.

First, we need some Maven dependencies. jcommander is used to parse the command line parameters and solrj to interact with our SolrCloud clusters.

<!-- SolrJ client, used to talk to the SolrCloud cluster. XXX is a placeholder: replace it with the SolrJ version you build against. -->
<dependency>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-solrj</artifactId>
    <version>XXX</version>
</dependency>
<!-- JCommander, used to parse the command line parameters. -->
<dependency>
    <groupId>com.beust</groupId>
    <artifactId>jcommander</artifactId>
    <version>1.60</version>
</dependency>

Now, let’s add some code:

package fr.layer4.solr.cli;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import lombok.Data;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpClientUtil;
import org.apache.solr.client.solrj.impl.Krb5HttpClientConfigurer;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.common.StringUtils;

import java.io.Console;
import java.io.IOException;
import java.time.DateTimeException;
import java.time.LocalDate;
import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Command-line tool that lists or deletes SolrCloud collections.
 *
 * <p>Collections can be filtered by prefix, by an exclusion list and by a date embedded in
 * their name (see {@code --pattern}, {@code --newer-than} and {@code --older-than}).
 */
@Data
public class Main {

    /** Unit applied to {@code --newer-than}/{@code --older-than} when {@code --time-unit} is omitted. */
    private static final ChronoUnit DEFAULT_TIME_UNIT = ChronoUnit.DAYS;

    @Parameter(names = "--help", help = true)
    private boolean help;

    /** Options shared by every sub-command: how to reach and authenticate against the cluster. */
    @Data
    public static class AbstractCommand {
        @Parameter(names = "--kerberos", description = "Use kerberos and JAAS config from -Djava.security.auth.login.config=...")
        private boolean kerberos;

        @Parameter(names = "--zk", description = "Zookeeper connection string", required = true)
        private String zk;
    }

    /** Options shared by the sub-commands that select a set of collections. */
    @Data
    public static class AbstractCommandList extends AbstractCommand {

        @Parameter(names = "--prefix", description = "Prefix")
        private String prefix;

        @Parameter(names = "--pattern", description = "Pattern. Example: logstash-%Y.%m.%d")
        private String pattern;

        @Parameter(names = "--newer-than", description = "Newer than...")
        private Integer newerThan;

        @Parameter(names = "--older-than", description = "Older than...")
        private Integer olderThan;

        @Parameter(names = "--time-unit", description = "Time unit...")
        private String timeUnit;

        @Parameter(names = "--exclude", description = "Exclude collections")
        private List<String> exclude;
    }

    /** The {@code list} sub-command. */
    @Data
    @Parameters(commandDescription = "List collections")
    public static class CommandList extends AbstractCommandList {

    }

    /** The {@code delete} sub-command. */
    @Data
    @Parameters(commandDescription = "Delete collections")
    public static class DeleteList extends AbstractCommandList {

        @Parameter(names = "--yes", description = "Force yes")
        private boolean yes;
    }

    /**
     * Parses the command line and runs the requested sub-command.
     *
     * @param args raw command line arguments
     * @throws Exception if the command line is invalid or the cluster cannot be queried
     */
    public static void main(String... args) throws Exception {

        Main main = new Main();
        JCommander jc = new JCommander(main);

        CommandList list = new CommandList();
        jc.addCommand("list", list);
        DeleteList delete = new DeleteList();
        jc.addCommand("delete", delete);

        jc.parse(args);

        if (main.help) {
            jc.usage();
            return;
        }

        main.run(jc);
    }

    /**
     * Executes the sub-command that was parsed by {@code jc}.
     *
     * <p>When no sub-command was given, the usage is printed and nothing else happens.
     *
     * @param jc the commander on which {@code parse} has already been called
     * @throws IOException if the Solr client cannot be closed or the request fails at I/O level
     * @throws SolrServerException if listing the collections fails
     * @throws IllegalArgumentException if the filtering options are inconsistent (see {@link #doList})
     */
    protected void run(JCommander jc) throws IOException, SolrServerException {

        String commandName = jc.getParsedCommand();
        if (commandName == null) {
            // No sub-command on the command line: there is no command object to look up.
            jc.usage();
            return;
        }
        AbstractCommand command = (AbstractCommand) jc.getCommands().get(commandName).getObjects().get(0);

        if (command.isKerberos()) {
            HttpClientUtil.setConfigurer(new Krb5HttpClientConfigurer());
        }

        try (SolrClient solr = new CloudSolrClient.Builder().withZkHost(command.getZk()).build()) {
            CollectionAdminResponse response = CollectionAdminRequest.listCollections().process(solr);
            @SuppressWarnings("unchecked") // the "collections" entry is read as a list of collection names
            List<String> collections = (List<String>) response.getResponse().get("collections");

            LocalDate now = LocalDate.now();

            if ("list".equals(commandName)) {
                doList(now, collections, (AbstractCommandList) command).forEach(this::out);
            } else if ("delete".equals(commandName)) {
                DeleteList delete = (DeleteList) command;
                deleteCollections(solr, doList(now, collections, delete), delete);
            }
        }
    }

    /**
     * Prints each candidate and deletes it, asking for confirmation unless {@code --yes} was given.
     * A failure on one collection is reported and does not stop the remaining deletions.
     */
    private void deleteCollections(SolrClient solr, List<String> candidates, DeleteList delete) {
        Console console = System.console();
        if (!delete.isYes() && console == null) {
            // System.console() is null when the JVM has no interactive terminal (cron, pipes, ...):
            // refuse to delete rather than fail with a NullPointerException on the first prompt.
            err("No interactive console available: re-run with --yes to delete without confirmation.");
            return;
        }

        for (String collection : candidates) {
            out(collection);

            if (!delete.isYes() && !confirm(console)) {
                out("\tSkipped");
                continue;
            }

            try {
                CollectionAdminRequest.deleteCollection(collection).process(solr);
                out("\tDone!");
            } catch (Exception e) {
                // Top-level CLI boundary: report and carry on with the next collection.
                err("\tError:");
                e.printStackTrace();
            }
        }
    }

    /** Asks the user to confirm a deletion; anything but "y"/"Y" (including end of input) means no. */
    private static boolean confirm(Console console) {
        String input = console.readLine("\t-> Delete? [yN]");
        return input != null && "y".equalsIgnoreCase(input.trim());
    }

    /**
     * Selects, in sorted order, the collections matching the options of {@code list}.
     *
     * <p>Filters are applied in this order: exact-name exclusion, prefix, then pattern. In the
     * pattern, {@code %Y}, {@code %m} and {@code %d} stand for a 4-digit year, a 2-digit month and
     * a 2-digit day; the remaining text is compiled as a case-insensitive regular expression, so a
     * {@code .} matches any character. With {@code --newer-than N} only names dated strictly after
     * {@code now - N units} are kept; with {@code --older-than N} only names dated strictly before
     * {@code now - N units}; when both are given, both conditions must hold. Names whose digits do
     * not form a valid calendar date are never selected by an age filter.
     *
     * @param now reference date used to compute the age bounds
     * @param collections names of all the collections
     * @param list filtering options
     * @return the selected collection names, sorted
     * @throws IllegalArgumentException if an age filter is requested without a pattern containing
     *         {@code %Y}, {@code %m} and {@code %d}, or if the time unit is unknown
     */
    protected List<String> doList(LocalDate now, List<String> collections, AbstractCommandList list) {
        Stream<String> stream = collections.stream().sorted();

        List<String> exclude = list.getExclude();
        if (exclude != null && !exclude.isEmpty()) {
            stream = stream.filter(c -> !exclude.contains(c));
        }

        String prefix = list.getPrefix();
        if (!StringUtils.isEmpty(prefix)) {
            stream = stream.filter(c -> c.startsWith(prefix));
        }

        String rawPattern = list.getPattern();
        Integer newerThan = list.getNewerThan();
        Integer olderThan = list.getOlderThan();
        boolean ageFilter = newerThan != null || olderThan != null;

        if (ageFilter) {
            // Without a date pattern the age cannot be computed; silently ignoring the filter
            // would make "delete --older-than N" select every collection.
            boolean hasDateTokens = !StringUtils.isEmpty(rawPattern)
                    && rawPattern.contains("%Y") && rawPattern.contains("%m") && rawPattern.contains("%d");
            if (!hasDateTokens) {
                throw new IllegalArgumentException(
                        "--newer-than/--older-than require a --pattern containing %Y, %m and %d");
            }
        }

        if (!StringUtils.isEmpty(rawPattern)) {
            Pattern compiled = compilePattern(rawPattern);

            ChronoUnit unit = ageFilter ? resolveTimeUnit(list.getTimeUnit()) : DEFAULT_TIME_UNIT;
            LocalDate newerBound = newerThan == null ? null : now.minus(newerThan, unit);
            LocalDate olderBound = olderThan == null ? null : now.minus(olderThan, unit);

            stream = stream.filter(c -> {
                Matcher matcher = compiled.matcher(c);
                if (!matcher.matches()) {
                    return false;
                }
                return !ageFilter || isWithinBounds(matcher, newerBound, olderBound);
            });
        }

        return stream.collect(Collectors.toList());
    }

    /** Turns the {@code %Y}/{@code %m}/{@code %d} placeholders into named groups and compiles the result. */
    private static Pattern compilePattern(String rawPattern) {
        String regex = "(?i)^" + rawPattern;
        regex = regex.replace("%Y", "(?<Y>\\d{4})");
        regex = regex.replace("%m", "(?<m>\\d{2})");
        regex = regex.replace("%d", "(?<d>\\d{2})");
        return Pattern.compile(regex);
    }

    /** Resolves the {@code --time-unit} value to a {@link ChronoUnit}, defaulting to days when absent. */
    private static ChronoUnit resolveTimeUnit(String timeUnit) {
        if (StringUtils.isEmpty(timeUnit)) {
            return DEFAULT_TIME_UNIT;
        }
        // Locale.ROOT keeps the upper-casing independent of the JVM default locale.
        return ChronoUnit.valueOf(timeUnit.trim().toUpperCase(Locale.ROOT));
    }

    /** Checks the date captured by {@code matcher} against the optional (null = unbounded) bounds. */
    private static boolean isWithinBounds(Matcher matcher, LocalDate newerBound, LocalDate olderBound) {
        LocalDate date;
        try {
            date = LocalDate.of(
                    Integer.parseInt(matcher.group("Y")),
                    Integer.parseInt(matcher.group("m")),
                    Integer.parseInt(matcher.group("d")));
        } catch (DateTimeException e) {
            // Digits that are not a real calendar date (e.g. month 13): age unknown, do not select.
            return false;
        }
        return (newerBound == null || date.isAfter(newerBound))
                && (olderBound == null || date.isBefore(olderBound));
    }

    /** Writes a line to standard output. */
    private void out(String log) {
        System.out.println(log);
    }

    /** Writes a line to standard error. */
    private void err(String log) {
        System.err.println(log);
    }
}

For a real tool, use the Maven plugin appassembler to create an executable binary. Via the programs configuration, appassembler will generate a Windows (.bat) script and a Unix shell script in the bin folder of the project build directory. All dependencies and the artifact itself are copied into the assemble directory, and a bin/ directory is created containing the generated scripts.

<!-- Generates launcher scripts for the CLI during the package phase. -->
<plugin>
    <groupId>org.codehaus.mojo</groupId>
    <artifactId>appassembler-maven-plugin</artifactId>
    <version>1.10</version>
    <executions>
        <execution>
            <id>daemon</id>
            <phase>package</phase>
            <goals>
                <goal>assemble</goal>
            </goals>
            <configuration>
                <repositoryLayout>flat</repositoryLayout>
                <repositoryName>lib</repositoryName>
                <assembleDirectory>${project.build.directory}/appassembler-jsw/solrcli</assembleDirectory>
                <logsDirectory>logs</logsDirectory>
                <programs>
                    <program>
                        <!-- The id is declared once per program. -->
                        <id>solrcli</id>
                        <jvmSettings>
                            <initialMemorySize>512m</initialMemorySize>
                            <maxMemorySize>1024m</maxMemorySize>
                        </jvmSettings>
                        <mainClass>fr.layer4.solr.cli.Main</mainClass>
                        <platforms>
                            <platform>windows</platform>
                            <platform>unix</platform>
                        </platforms>
                    </program>
                </programs>
            </configuration>
        </execution>
    </executions>
</plugin>

The scripts are in target/appassembler-jsw/solrcli/bin.

And now, let’s see the magic. First, list the collections:

bin/solrcli list --zk zkhost1:2181,zkhost2:2181,zkhost3:2181/solr

collection1
collection-test
logstash-2016.11.23
logstash-2016.11.24
logstash-2016.11.26
logstash-2016.11.27
logstash-2016.11.29
...

Or if you need authentication with Kerberos:

JAVA_OPTS="-Djava.security.auth.login.config=.../my.jaas.conf" bin/solrcli list --zk zkhost1:2181,zkhost2:2181,zkhost3:2181/solr --kerberos

Next, try the pattern:

bin/solrcli list --zk zkhost1:2181,zkhost2:2181,zkhost3:2181/solr --pattern logstash-%Y.%m.%d

logstash-2016.11.23
logstash-2016.11.24
logstash-2016.11.26
logstash-2016.11.27
logstash-2016.11.29
...

And filter the old collections:

bin/solrcli list --zk zkhost1:2181,zkhost2:2181,zkhost3:2181/solr --pattern logstash-%Y.%m.%d --older-than 30 --time-unit days

logstash-2016.11.23
logstash-2016.11.24
logstash-2016.11.26

If you want to exclude some collections, use the --exclude option:

bin/solrcli list --zk zkhost1:2181,zkhost2:2181,zkhost3:2181/solr --pattern logstash-%Y.%m.%d --older-than 30 --time-unit days --exclude logstash-2016.11.11 --exclude logstash-2016.11.12

The same options are available for the delete command:

bin/solrcli delete --zk zkhost1:2181,zkhost2:2181,zkhost3:2181/solr --pattern logstash-%Y.%m.%d --older-than 30 --time-unit days

Delete prompts for confirmation before removing each collection; if you want to use it in batch mode, add --yes:

bin/solrcli delete --zk zkhost1:2181,zkhost2:2181,zkhost3:2181/solr --pattern logstash-%Y.%m.%d --older-than 30 --time-unit days --yes

Have fun!

Credits:
“Stars over Iceland” by Claudia Regina is licensed under CC BY-SA 2.0 / Resized

Related Posts

Leave a comment