HTML Tag Validation & Extraction Regular Expression

HTML tag validation and extraction are common needs in text processing, sanitization, and parsing pipelines. Whether you need to verify that a string is a well-formed HTML tag or pull all tags out of an HTML document, a carefully crafted regular expression can handle both tasks efficiently. This article provides two patterns: one for validating a complete tag string and one for extracting tags from arbitrary HTML content.

Validation: Is This a Valid HTML Tag?

^<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>$

Use this pattern with anchors (^ and $) to test whether an entire string is a single, well-formed HTML tag — opening, closing, or self-closing.

Extraction: Find All Tags in a String

<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>

The same pattern without anchors extracts every HTML tag found within a larger block of HTML content. In JavaScript, combine it with the global flag (g); in other languages, use the regex library's find-all function instead.

Explanation

^ / $ — Start and end anchors (validation only). Omit these when extracting tags from a larger string.
< — Literal opening angle bracket.
\/? — Optional forward slash, matching closing tags like </div>.
[a-zA-Z][a-zA-Z0-9-]* — Tag name: must start with a letter, followed by letters, digits, or hyphens. Hyphens must be allowed because custom element names such as <my-component> contain them.
(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)* — Zero or more attributes:
- \s+ — Required whitespace before each attribute.
- [a-zA-Z_:][a-zA-Z0-9_.:-]* — Attribute name (covers plain names as well as hyphenated and namespaced ones such as data-value or xml:lang).
- (?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))? — Optional attribute value: double-quoted, single-quoted, or unquoted.
\s*\/?> — Optional whitespace, optional self-closing slash, then the closing >. In the validation pattern this is followed by the $ end anchor.

Supported Tag Syntax

Opening tags: <div>, <p class="text">
Closing tags: </div>, </span>
Self-closing tags: <br/>, <img src="x.png" />
Boolean attributes: <input required>, <details open>
Quoted attribute values: double-quoted ("value") and single-quoted ('value')
Namespaced & data attributes: data-id="1", xml:lang="en"
Custom elements: <my-component>, <app-header>

Validation Implementation

/**
 * Anchored pattern that accepts exactly one HTML tag: opening, closing, or
 * self-closing, with optional quoted, unquoted, or boolean attributes.
 */
const htmlTagRegex = /^<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>$/;

/**
 * Reports whether `tag` is, in its entirety, a single well-formed HTML tag.
 *
 * @param {string} tag - Candidate text to check.
 * @returns {boolean} `true` when the whole input matches the tag pattern.
 */
const isValidHtmlTag = (tag) => {
  const matchesWholeTag = htmlTagRegex.test(tag);
  return matchesWholeTag;
};

import re

def is_valid_html_tag(tag):
  """Return True when ``tag`` is, in its entirety, a single well-formed HTML tag.

  Accepts opening, closing, and self-closing tags with optional attributes
  (double-quoted, single-quoted, unquoted, or boolean).

  Args:
    tag: Candidate string to validate.

  Returns:
    bool: True if the whole string matches the tag pattern, False otherwise.
  """
  # Triple-quoted raw string: the pattern contains both " and ', so a plain
  # r"..." literal would be terminated by the first quote inside the pattern.
  html_tag_regex = r'''^<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>$'''
  # fullmatch (rather than match) so that "$" cannot accept a trailing newline.
  return re.fullmatch(html_tag_regex, tag) is not None

use regex::Regex;

fn is_valid_html_tag(tag: &str) -> bool {
  let html_tag_regex = Regex::new("^<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>$")
    .expect("Could not parse HTML tag validation regex");
  html_tag_regex.is_match(tag)
}

package main

import (
  "fmt"
  "regexp"
)

// htmlTagRegex is the anchored pattern for a single HTML tag (opening, closing,
// or self-closing, with optional attributes). It is compiled once at package
// initialisation instead of on every call.
var htmlTagRegex = regexp.MustCompile(`^<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>$`)

// isValidHtmlTag reports whether tag is, in its entirety, a single well-formed
// HTML tag.
func isValidHtmlTag(tag string) bool {
  return htmlTagRegex.MatchString(tag)
}

import Foundation

/// Returns whether `tag` is, in its entirety, a single well-formed HTML tag.
///
/// Opening, closing, and self-closing tags are accepted, with optional
/// attributes (double-quoted, single-quoted, unquoted, or boolean).
///
/// - Parameter tag: The candidate text to validate.
/// - Returns: `true` when the whole string matches the tag pattern.
func isValidHtmlTag(_ tag: String) -> Bool {
  // Raw string literal: the pattern contains double quotes and backslash
  // sequences (`\/`, `\s`) that are not valid escapes in an ordinary literal.
  let htmlTagRegex = #"^<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>$"#
  // The pattern is a constant, so a compilation failure is a programmer error.
  let regex = try! NSRegularExpression(pattern: htmlTagRegex)
  let fullRange = NSRange(location: 0, length: tag.utf16.count)
  guard let match = regex.firstMatch(in: tag, range: fullRange) else {
    return false
  }
  // `$` may also match just before a trailing line terminator, so require the
  // match to span the entire input.
  return match.range.location == fullRange.location
    && match.range.length == fullRange.length
}

using System;
using System.Text.RegularExpressions;

class Application {
  /// <summary>
  /// Determines whether <paramref name="tag"/> is, in its entirety, a single
  /// well-formed HTML tag (opening, closing, or self-closing).
  /// </summary>
  /// <param name="tag">Candidate text to validate.</param>
  /// <returns><c>true</c> when the whole string matches the tag pattern.</returns>
  static bool IsValidHtmlTag(string tag) {
    // Verbatim string: backslashes are literal and each double quote inside the
    // pattern is written as "". The end anchor is \z rather than $ because $
    // would also accept a trailing newline.
    string htmlTagRegex = @"^<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:""[^""]*""|'[^']*'|[^\s>""']*))?)*\s*\/?>\z";
    return Regex.IsMatch(tag, htmlTagRegex);
  }
}

import java.util.regex.*;

public class Application {
  /**
   * Anchored pattern for a single HTML tag (opening, closing, or self-closing,
   * with optional attributes). Compiled once and reused across calls.
   */
  private static final Pattern HTML_TAG_PATTERN = Pattern.compile(
      "^<\\/?[a-zA-Z][a-zA-Z0-9-]*(?:\\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\\s*=\\s*(?:\"[^\"]*\"|'[^']*'|[^\\s>\"']*))?)*\\s*\\/?>$");

  /**
   * Checks whether {@code tag} is, in its entirety, a single well-formed HTML tag.
   *
   * @param tag candidate text to validate
   * @return {@code true} when the whole string matches the tag pattern
   */
  public static boolean isValidHtmlTag(String tag) {
    // matches() requires the pattern to cover the entire input.
    return HTML_TAG_PATTERN.matcher(tag).matches();
  }
}

<?php
/**
 * Checks whether $tag is, in its entirety, a single well-formed HTML tag
 * (opening, closing, or self-closing, with optional attributes).
 *
 * @param string $tag Candidate text to validate.
 * @return bool True when the whole string matches the tag pattern.
 */
function isValidHtmlTag($tag) {
  // Single-quoted string: the pattern contains double quotes, which would end a
  // double-quoted literal. Only the single quotes need a backslash here.
  $htmlTagRegex = '^<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>"\']*))?)*\s*\/?>$';
  // The D modifier makes "$" match only at the very end, so a trailing newline
  // is rejected. preg_match returns 1 on a match; compare to yield a real bool.
  return preg_match('/' . $htmlTagRegex . '/D', $tag) === 1;
}
?>

Extraction Implementation

/**
 * Unanchored, global pattern that matches each HTML tag (opening, closing, or
 * self-closing) inside a larger string.
 */
const htmlTagExtractRegex = /<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>/g;

/**
 * Collects every HTML tag found in `html`, in order of appearance.
 *
 * @param {string} html - Text to scan.
 * @returns {string[]} The matched tags, or an empty array when there are none.
 */
const extractHtmlTags = (html) => {
  const found = html.match(htmlTagExtractRegex);
  // No match yields null; normalise that to an empty list.
  return found == null ? [] : found;
};

import re

def extract_html_tags(html):
  """Return every HTML tag found in ``html``, in order of appearance.

  Args:
    html: Text to scan for opening, closing, and self-closing tags.

  Returns:
    list[str]: The matched tag strings; empty when no tag is found.
  """
  # Triple-quoted raw string: the pattern contains both " and ', so a plain
  # r"..." literal would be terminated by the first quote inside the pattern.
  html_tag_extract_regex = r'''<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>'''
  # All groups are non-capturing, so findall yields the full text of each match.
  return re.findall(html_tag_extract_regex, html)

use regex::Regex;

fn extract_html_tags(html: &str) -> Vec<&str> {
  let html_tag_extract_regex = Regex::new("<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>")
    .expect("Could not parse HTML tag extraction regex");
  html_tag_extract_regex.find_iter(html).map(|m| m.as_str()).collect()
}

package main

import (
  "regexp"
)

// htmlTagExtractRegex is the unanchored pattern for an HTML tag (opening,
// closing, or self-closing, with optional attributes). It is compiled once at
// package initialisation instead of on every call.
var htmlTagExtractRegex = regexp.MustCompile(`<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>`)

// extractHtmlTags returns every HTML tag found in html, in order of appearance.
// The result is nil when html contains no tag.
func extractHtmlTags(html string) []string {
  // -1 asks for all matches rather than a bounded number.
  return htmlTagExtractRegex.FindAllString(html, -1)
}

import Foundation

/// Returns every HTML tag found in `html`, in order of appearance.
///
/// - Parameter html: The text to scan for opening, closing, and self-closing tags.
/// - Returns: The matched tag strings; empty when no tag is found.
func extractHtmlTags(_ html: String) -> [String] {
  // Raw string literal: the pattern contains double quotes and backslash
  // sequences (`\/`, `\s`) that are not valid escapes in an ordinary literal.
  let pattern = #"<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>"']*))?)*\s*\/?>"#
  // The pattern is a constant, so a compilation failure is a programmer error.
  let htmlTagExtractRegex = try! NSRegularExpression(pattern: pattern)
  // NSRange is measured in UTF-16 code units.
  let range = NSRange(location: 0, length: html.utf16.count)
  return htmlTagExtractRegex.matches(in: html, range: range).compactMap { match in
    // Convert the NSRange back to String indices; skip a match if that fails.
    Range(match.range, in: html).map { String(html[$0]) }
  }
}

using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;

class Application {
  /// <summary>
  /// Collects every HTML tag (opening, closing, or self-closing) found in
  /// <paramref name="html"/>, in order of appearance.
  /// </summary>
  /// <param name="html">Text to scan.</param>
  /// <returns>The matched tag strings; empty when no tag is found.</returns>
  static List<string> ExtractHtmlTags(string html) {
    // Verbatim string: backslashes are literal and each double quote inside the
    // pattern is written as "".
    string htmlTagExtractRegex = @"<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:""[^""]*""|'[^']*'|[^\s>""']*))?)*\s*\/?>";
    var result = new List<string>();
    foreach (Match match in Regex.Matches(html, htmlTagExtractRegex)) {
      result.Add(match.Value);
    }
    return result;
  }
}

import java.util.ArrayList;
import java.util.List;
import java.util.regex.*;

public class Application {
  /**
   * Unanchored pattern for an HTML tag (opening, closing, or self-closing, with
   * optional attributes). Compiled once and reused across calls.
   */
  private static final Pattern HTML_TAG_EXTRACT_PATTERN = Pattern.compile(
      "<\\/?[a-zA-Z][a-zA-Z0-9-]*(?:\\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\\s*=\\s*(?:\"[^\"]*\"|'[^']*'|[^\\s>\"']*))?)*\\s*\\/?>");

  /**
   * Collects every HTML tag found in {@code html}, in order of appearance.
   *
   * @param html text to scan
   * @return the matched tag strings; empty when no tag is found
   */
  public static List<String> extractHtmlTags(String html) {
    List<String> tags = new ArrayList<>();
    Matcher matcher = HTML_TAG_EXTRACT_PATTERN.matcher(html);
    while (matcher.find()) {
      tags.add(matcher.group());
    }
    return tags;
  }
}

<?php
/**
 * Collects every HTML tag (opening, closing, or self-closing) found in $html,
 * in order of appearance.
 *
 * @param string $html Text to scan.
 * @return array The matched tag strings; empty when no tag is found.
 */
function extractHtmlTags($html) {
  // Single-quoted string: the pattern contains double quotes, which would end a
  // double-quoted literal. Only the single quotes need a backslash here.
  $htmlTagExtractRegex = '<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>"\']*))?)*\s*\/?>';
  preg_match_all('/' . $htmlTagExtractRegex . '/', $html, $matches);
  // Index 0 holds the full text of every match.
  return $matches[0];
}
?>

Validation Test Cases

Each input is tested as a complete string to determine whether it is a valid HTML tag.

HTML Tag	Valid
<div>	Yes
<p>	Yes
<h1>	Yes
<span>	Yes
</div>	Yes
</p>	Yes
<br/>	Yes
<hr/>	Yes
<br />	Yes
<img />	Yes
<img src="image.png">	Yes
<a href="https://example.com" class="link">	Yes
<input type="text" required>	Yes
<div class='container'>	Yes
<INPUT TYPE='TEXT'>	Yes
<my-component>	Yes
<meta charset="UTF-8" />	Yes
<span data-value="123">	Yes
(empty string)	No
<>	No
<123>	No
< div>	No
<div	No
div>	No
hello world	No
<!-- comment -->	No

Extraction Test Cases

Each input is a string of HTML content. The result indicates whether the string contains at least one valid HTML tag.

HTML Content	Valid
<div>hello</div>	Yes
<p class="text">paragraph</p>	Yes
Click <a href="#">here</a>	Yes
<br/>	Yes
<ul><li>item</li></ul>	Yes
<img src="photo.jpg" alt="photo" />	Yes
hello world	No
(empty string)	No
<>	No
3 < 5 > 2	No