Scraping the Web using Kotlin

A very popular library for scraping websites is JSoup. JSoup is a Java library, but it can easily be used from Kotlin applications.

To get started with JSoup, add the JSoup dependency to your Maven or Gradle project.

Maven

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>

Gradle

// The 'compile' configuration is deprecated and was removed in Gradle 7; use 'implementation' instead.
implementation group: 'org.jsoup', name: 'jsoup', version: '1.12.1'

JSoup example using Kotlin

To read an HTML page using JSoup in Kotlin, you can use the following code:

package com.kotlintips.jsoup

import org.jsoup.Jsoup

/**
 * Downloads the Hacker News front page with jsoup, then prints the page title
 * followed by the `href` attribute value of every anchor that has one.
 */
fun main() {
    val page = Jsoup.connect("https://news.ycombinator.com").get()

    // The title is printed first, ahead of the links.
    println(page.title())

    // One output line per anchor that carries an href attribute.
    for (anchor in page.select("a[href]")) {
        println(anchor.attr("href"))
    }
}

JSoup ListLinks example

A slightly bigger example can be found below. It’s based on the ListLinks example provided by JSoup, but now rewritten in Kotlin:

package com.kotlintips.jsoup

import org.jsoup.Jsoup
import org.jsoup.helper.Validate

/**
 * Example program to list links from a URL.
 *
 * Fetches the page named by the single command-line argument and prints three
 * sections: elements with a `src` attribute ("Media"), `<link href>` elements
 * ("Imports") and `<a href>` anchors ("Links").
 */
object ListLinks {

    /**
     * Program entry point.
     *
     * @param args must contain exactly one element, the URL to fetch; anything
     *   else is rejected by jsoup's `Validate.isTrue` with a usage message.
     */
    @JvmStatic
    fun main(args: Array<String>) {
        Validate.isTrue(args.size == 1, "usage: supply url to fetch")
        val url = args[0]
        println("Fetching $url...")

        val doc = Jsoup.connect(url).get()
        val links = doc.select("a[href]")
        val media = doc.select("[src]")
        val imports = doc.select("link[href]")

        println("Media: (${media.size})")
        for (src in media) {
            if (src.tagName() == "img") {
                // Images additionally report "<width>x<height>" and a shortened alt text.
                // The "abs:" attribute prefix asks jsoup for the absolute form of the URL.
                println(
                    " * ${src.tagName()}: <${src.attr("abs:src")}>, " +
                        "${src.attr("width")}x${src.attr("height")} (${trim(src.attr("alt"), 20)})"
                )
            } else {
                println(" * ${src.tagName()}: <${src.attr("abs:src")}>")
            }
        }

        println("Imports: (${imports.size})")
        for (link in imports) {
            println(" * ${link.tagName()} <${link.attr("abs:href")}> (${link.attr("rel")})")
        }

        println("Links: (${links.size})")
        for (link in links) {
            println(" * a: <${link.attr("abs:href")}>  (${trim(link.text(), 35)})")
        }
    }

    /**
     * Shortens [s] for display.
     *
     * @param s the text to shorten.
     * @param width the maximum length of the returned string; expected to be positive.
     * @return [s] unchanged when it is at most [width] characters long, otherwise its
     *   first `width - 1` characters followed by a single `.`.
     */
    private fun trim(s: String, width: Int): String =
        if (s.length > width) s.substring(0, width - 1) + "." else s
}

Leave a Reply

Your email address will not be published. Required fields are marked *