이 모듈에 대한 설명문서는 모듈:languages/설명문서에서 만들 수 있습니다

local require = require

local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local mw = mw
local string = string
local table = table
local ustring = mw.ustring

local check_object = require("Module:utilities").check_object
local concat = table.concat
local decode_entities = m_str_utils.decode_entities
local decode_uri = m_str_utils.decode_uri
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local ipairs = ipairs
local load_data = mw.loadData
local match = string.match
local next = next
local pairs = pairs
local remove = table.remove
local remove_duplicates = m_table.removeDuplicates
local select = select
local setmetatable = setmetatable
local shallowcopy = m_table.shallowcopy
local split = m_str_utils.split
local type = type
local ugsub = ustring.gsub
local ulen = m_str_utils.len
local umatch = ustring.match

-- Loaded as needed by findBestScript.
local Hans_chars
local Hant_chars

local export = {}

--[=[
This module implements fetching of language-specific information and processing text in a given language.

There are two types of languages: full languages and etymology-only languages. The essential difference is that only
full languages appear in L2 headings in vocabulary entries, and hence categories like [[:Category:French nouns]] exist
only for full languages. Etymology-only languages have either a full language or another etymology-only language as
their parent (in the parent-child inheritance sense), and for etymology-only languages with another etymology-only
language as their parent, a full language can always be derived by following the parent links upwards. For example,
"Canadian French", code 'fr-CA', is an etymology-only language whose parent is the full language "French", code 'fr'.
An example of an etymology-only language with another etymology-only parent is "Northumbrian Old English", code
'ang-nor', which has "Anglian Old English", code 'ang-ang' as its parent; this is an etymology-only language whose
parent is "Old English", code "ang", which is a full language. (This is because Northumbrian Old English is considered
a variety of Anglian Old English.) Sometimes the parent is the "Undetermined" language, code 'und'; this is the case,
for example, for "substrate" languages such as "Pre-Greek", code 'qsb-grc', and "the BMAC substrate", code 'qsb-bma'.

It is important to distinguish language ''parents'' from language ''ancestors''. The parent-child relationship is one
of containment, i.e. if X is a child of Y, X is considered a variety of Y. On the other hand, the ancestor-descendant
relationship is one of descent in time. For example, "Classical Latin", code 'la-cla', and "Late Latin", code 'la-lat',
are both etymology-only languages with "Latin", code 'la', as their parents, because both of the former are varieties
of Latin. However, Late Latin does *NOT* have Classical Latin as its parent because Late Latin is *not* a variety of
Classical Latin; rather, it is a descendant. There is in fact a separate 'ancestors' field that is used to express the
ancestor-descendant relationship, and Late Latin's ancestor is given as Classical Latin. It is also important to note
that sometimes an etymology-only language is actually the conceptual ancestor of its parent language. This happens,
for example, with "Old Italian" (code 'roa-oit'), which is an etymology-only variant of full language "Italian" (code
'it'), and with "Old Latin" (code 'itc-ola'), which is an etymology-only variant of Latin. In both cases, the full
language has the etymology-only variant listed as an ancestor. This allows a Latin term to inherit from Old Latin
using the {{tl|inh}} template (where in this template, "inheritance" refers to ancestral inheritance, i.e. inheritance
in time, rather than in the parent-child sense); likewise for Italian and Old Italian.

Full languages come in three subtypes:
* {regular}: This indicates a full language that is attested according to [[WT:CFI]] and therefore permitted in the
			 main namespace. There may also be reconstructed terms for the language, which are placed in the
			 {Reconstruction} namespace and must be prefixed with * to indicate a reconstruction. Most full languages
			 are natural (not constructed) languages, but a few constructed languages (e.g. Esperanto and Volapük,
			 among others) are also allowed in the mainspace and considered regular languages.
* {reconstructed}: This language is not attested according to [[WT:CFI]], and therefore is allowed only in the
				   {Reconstruction} namespace. All terms in this language are reconstructed, and must be prefixed with
				   *. Languages such as Proto-Indo-European and Proto-Germanic are in this category.
* {appendix-constructed}: This language is attested but does not meet the additional requirements set out for
						  constructed languages ([[WT:CFI#Constructed languages]]). Its entries must therefore be in
						  the Appendix namespace, but they are not reconstructed and therefore should not have *
						  prefixed in links. Most constructed languages are of this subtype.

Both full languages and etymology-only languages have a {Language} object associated with them, which is fetched using
the {getByCode} function in [[Module:languages]] to convert a language code to a {Language} object. Depending on the
options supplied to this function, etymology-only languages may or may not be accepted, and family codes may be
accepted (returning a {Family} object as described in [[Module:families]]). There are also separate {getByCanonicalName}
functions in [[Module:languages]] and [[Module:etymology languages]] to convert a language's canonical name to a
{Language} object (depending on whether the canonical name refers to a full or etymology-only language).

Textual strings belonging to a given language come in several different ''text variants'':
# The ''input text'' is what the user supplies in wikitext, in the parameters to {{tl|m}}, {{tl|l}}, {{tl|ux}},
  {{tl|t}}, {{tl|lang}} and the like.
# The ''display text'' is the text in the form as it will be displayed to the user. This can include accent marks that
  are stripped to form the entry text (see below), as well as embedded bracketed links that are variously processed
  further. The display text is generated from the input text by applying language-specific transformations; for most
  languages, there will be no such transformations. Examples of transformations are bad-character replacements for
  certain languages (e.g. replacing 'l' or '1' to [[palochka]] in certain languages in Cyrillic); and for Thai and
  Khmer, converting space-separated words to bracketed words and resolving respelling substitutions such as [กรีน/กฺรีน],
  which indicate how to transliterate given words.
# The ''entry text'' is the text in the form used to generate a link to a Wiktionary entry. This is usually generated
  from the display text by stripping certain sorts of diacritics on a per-language basis, and sometimes doing other
  transformations. The concept of ''entry text'' only really makes sense for text that does not contain embedded links,
  meaning that display text containing embedded links will need to have the links individually processed to get
  per-link entry text in order to generate the resolved display text (see below).
# The ''resolved display text'' is the result of resolving embedded links in the display text (e.g. converting them to
  two-part links where the first part has entry-text transformations applied, and adding appropriate language-specific
  fragments) and adding appropriate language and script tagging. This text can be passed directly to MediaWiki for
  display.
# The ''source translit text'' is the text as supplied to the language-specific {transliterate()} method. The form of
  the source translit text may need to be language-specific, e.g Thai and Khmer will need the full unprocessed input
  text, whereas other languages may need to work off the display text. [FIXME: It's still unclear to me how embedded
  bracketed links are handled in the existing code.] In general, embedded links need to be removed (i.e. converted to
  their "bare display" form by taking the right part of two-part links and removing double brackets), but when this
  happens is unclear to me [FIXME]. Some languages have a chop-up-and-paste-together scheme that sends parts of the
  text through the transliterate mechanism, and for others (those listed in {contiguous_substition} in
  [[Module:languages/data]]) they receive the full input text, but preprocessed in certain ways. (The wisdom of this is
  still unclear to me.)
# The ''transliterated text'' (or ''transliteration'') is the result of transliterating the source translit text.
  Unlike for all the other text variants except the transcribed text, it is always in the Latin script.
# The ''transcribed text'' (or ''transcription'') is the result of transcribing the source translit text, where
  "transcription" here means a close approximation to the phonetic form of the language in languages (e.g. Akkadian,
  Sumerian, Ancient Egyptian, maybe Tibetan) that have a wide difference between the written letters and spoken form.
  Unlike for all the other text variants other than the transliterated text, it is always in the Latin script.
  Currently, the transcribed text is always supplied manually be the user; there is no such thing as a
  {lua|transcribe()} method on language objects.
# The ''sort key'' is the text used in sort keys for determining the placing of pages in categories they belong to. The
  sort key is generated from the pagename or a specified ''sort base'' by lowercasing, doing language-specific
  transformations and then uppercasing the result. If the sort base is supplied and is generated from input text, it
  needs to be converted to display text, have embedded links removed (i.e. resolving them to their right side if they
  are two-part links) and have entry text transformations applied.
# There are other text variants that occur in usexes (specifically, there are normalized variants of several of the
  above text variants), but we can skip them for now.

The following methods exist on {Language} objects to convert between different text variants:
# {makeDisplayText}: This converts input text to display text.
# {lua|makeEntryName}: This converts input or display text to entry text. [FIXME: This needs some rethinking. In
  particular, {lua|makeEntryName} is sometimes called on display text (in some paths inside of [[Module:links]]) and
  sometimes called on input text (in other paths inside of [[Module:links]], and usually from other modules). We need
  to make sure we don't try to convert input text to display text twice, but at the same time we need to support
  calling it directly on input text since so many modules do this. This means we need to add a parameter indicating
  whether the passed-in text is input or display text; if that former, we call {lua|makeDisplayText} ourselves.]
# {lua|transliterate}: This appears to convert input text with embedded brackets removed into a transliteration.
  [FIXME: This needs some rethinking. In particular, it calls {lua|processDisplayText} on its input, which won't work
  for Thai and Khmer, so we may need language-specific flags indicating whether to pass the input text directly to the
  language transliterate method. In addition, I'm not sure how embedded links are handled in the existing translit code;
  a lot of callers remove the links themselves before calling {lua|transliterate()}, which I assume is wrong.]
# {lua|makeSortKey}: This converts entry text (?) to a sort key. [FIXME: Clarify this.]
]=]

local function track(page)
	require("Module:debug/track")("languages/" .. page)
	return true
end

local function normalize_code(code)
	return load_data("Module:languages/data").aliases[code] or code
end

-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
local function escape_risky_characters(text)
	local encode_entities = require("Module:string/encode entities")
	-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
	if umatch(text, "^%s*$") then
		return encode_entities(text, text)
	else
		return encode_entities(text, "!#%&*+/:;<=>?@[\\]_{|}")
	end
end

-- Temporarily convert various formatting characters to PUA to prevent them from being disrupted by the substitution process.
local function doTempSubstitutions(text, subbedChars, keepCarets, noTrim)
	-- Clone so that we don't insert any extra patterns into the table in package.loaded. For some reason, using require seems to keep memory use down; probably because the table is always cloned.
	local patterns = shallowcopy(require("Module:languages/data/patterns"))
	if keepCarets then
		insert(patterns, "((\\+)%^)")
		insert(patterns, "((%^))")
	end
	-- Ensure any whitespace at the beginning and end is temp substituted, to prevent it from being accidentally trimmed. We only want to trim any final spaces added during the substitution process (e.g. by a module), which means we only do this during the first round of temp substitutions.
	if not noTrim then
		insert(patterns, "^([\128-\191\244]*(%s+))")
		insert(patterns, "((%s+)[\128-\191\244]*)$")
	end
	-- Pre-substitution, of "[[" and "]]", which makes pattern matching more accurate.
	text = text
		:gsub("%f[%[]%[%[", "\1")
		:gsub("%f[%]]%]%]", "\2")
	local i, pe = #subbedChars, require("Module:string utilities").pattern_escape
	for _, pattern in ipairs(patterns) do
		-- Patterns ending in \0 stand are for things like "[[" or "]]"), so the inserted PUA are treated as breaks between terms by modules that scrape info from pages.
		local term_divider
		pattern = pattern:gsub("%z$", function(divider)
			term_divider = divider == "\0"
			return ""
		end)
		text = text:gsub(pattern, function(...)
			local m = {...}
			local m1New = m[1]
			for k = 2, #m do
				local n = i + k - 1
				subbedChars[n] = m[k]
				local byte2 = math.floor(n / 4096) % 64 + (term_divider and 128 or 136)
				local byte3 = math.floor(n / 64) % 64 + 128
				local byte4 = n % 64 + 128
				m1New = m1New:gsub(pe(m[k]), "\244" .. string.char(byte2) .. string.char(byte3) .. string.char(byte4), 1)
			end
			i = i + #m - 1
			return m1New
		end)
	end
	text = text
		:gsub("\1", "%[%[")
		:gsub("\2", "%]%]")
	return text, subbedChars
end

-- Reinsert any formatting that was temporarily substituted.
local function undoTempSubstitutions(text, subbedChars)
	local pe = require("Module:string utilities").pattern_escape
	for i = 1, #subbedChars do
		local byte2 = math.floor(i / 4096) % 64 + 128
		local byte3 = math.floor(i / 64) % 64 + 128
		local byte4 = i % 64 + 128
		text = text:gsub("\244[" .. string.char(byte2) .. string.char(byte2+8) .. "]" .. string.char(byte3) .. string.char(byte4), pe(subbedChars[i]))
	end
	text = text
		:gsub("\1", "%[%[")
		:gsub("\2", "%]%]")
	return text
end
-- Check if the raw text is an unsupported title, and if so return that. Otherwise, remove HTML entities. We do the pre-conversion to avoid loading the unsupported title list unnecessarily.
local function checkNoEntities(self, text)
	local textNoEnc = decode_entities(text)
	if textNoEnc ~= text and self:loadData("Module:links/data").unsupported_titles[text] then
		return text
	else
		return textNoEnc
	end
end

-- If no script object is provided (or if it's invalid or None), get one.
local function checkScript(text, self, sc)
	if not check_object("script", true, sc) or sc:getCode() == "None" then
		return self:findBestScript(text)
	else
		return sc
	end
end

local function normalize(text, sc)
	text = sc:fixDiscouragedSequences(text)
	return sc:toFixedNFD(text)
end

-- Split the text into sections, based on the presence of temporarily substituted formatting characters, then iterate over each one to apply substitutions. This avoids putting PUA characters through language-specific modules, which may be unequipped for them.
local function iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, substitution_data, function_name)
	local pe = require("Module:string utilities").pattern_escape
	local fail, cats, sections = nil, {}
	-- See [[Module:languages/data]].
	if not text:find("\244") or self:loadData("Module:languages/data").contiguous_substitution[self._code] then
		sections = {text}
	else
		sections = split(text, "\244[\128-\143][\128-\191]*", true)
	end
	for _, section in ipairs(sections) do
		-- Don't bother processing empty strings or whitespace (which may also not be handled well by dedicated modules).
		if section:gsub("%s+", "") ~= "" then
			local sub, sub_fail, sub_cats = require("Module:languages/doSubstitutions")(section, self, sc, substitution_data, function_name)
			-- Second round of temporary substitutions, in case any formatting was added by the main substitution process. However, don't do this if the section contains formatting already (as it would have had to have been escaped to reach this stage, and therefore should be given as raw text).
			if sub and subbedChars then
				local noSub
				for _, pattern in ipairs(require("Module:languages/data/patterns")) do
					if section:match(pattern .. "%z?") then
						noSub = true
					end
				end
				if not noSub then
					sub, subbedChars = doTempSubstitutions(sub, subbedChars, keepCarets, true)
				end
			end
			if (not sub) or sub_fail then
				text = sub
				fail = sub_fail
				cats = sub_cats or {}
				break
			end
			text = sub and text:gsub(pe(section), pe(sub), 1) or text
			if type(sub_cats) == "table" then
				for _, cat in ipairs(sub_cats) do
					insert(cats, cat)
				end
			end
		end
	end

	-- Trim, unless there are only spacing characters, while ignoring any final formatting characters.
	text = text and text
		:gsub("^([\128-\191\244]*)%s+(%S)", "%1%2")
		:gsub("(%S)%s+([\128-\191\244]*)$", "%1%2")

	-- Remove duplicate categories.
	if #cats > 1 then
		cats = remove_duplicates(cats)
	end

	return text, fail, cats, subbedChars
end

-- Process carets (and any escapes). Default to simple removal, if no pattern/replacement is given.
local function processCarets(text, pattern, repl)
	local rep
	repeat
		text, rep = text:gsub("\\\\(\\*^)", "\3%1")
	until rep == 0
	return text
		:gsub("\\^", "\4")
		:gsub(pattern or "%^", repl or "")
		:gsub("\3", "\\")
		:gsub("\4", "^")
end

-- Remove carets if they are used to capitalize parts of transliterations (unless they have been escaped).
local function removeCarets(text, sc)
	if not sc:hasCapitalization() and sc:isTransliterated() and text:match("%^") then
		return processCarets(text)
	else
		return text
	end
end

local Language = {}

function Language:loadData(modname)
	return (self._useRequire and require or mw.loadData)(modname)
end

--[==[Returns the language code of the language. Example: {{code|lua|"fr"}} for French.]==]
function Language:getCode()
	return self._code
end

function Language:getCanonicalName()
	return self._rawData.canonicalName
end


function Language:getOtherNames()
	return self._rawData.otherNames or {}
end


--[==[Returns a table of types as a lookup table (with the types as keys). 

The possible types are
* {language}: This is a language, either full or etymology-only.
* {full}: This is a "full" (not etymology-only) language, i.e. the union of {regular}, {reconstructed} and
		  {appendix-constructed}. Note that the types {full} and {etymology-only} also exist for families, so if you
		  want to check specifically for a full language and you have an object that might be a family, you should
		  use {{lua|hasType("language", "full")}} and not simply {{lua|hasType("full")}}.
* {etymology-only}: This is an etymology-only (not full) language, whose parent is another etymology-only
					language or a full language. Note that the types {full} and {etymology-only} also exist for
					families, so if you want to check specifically for an etymology-only language and you have an
					object that might be a family, you should use {{lua|hasType("language", "etymology-only")}}
					and not simply {{lua|hasType("etymology-only")}}.
* {regular}: This indicates a full language that is attested according to [[WT:CFI]] and therefore permitted
			 in the main namespace. There may also be reconstructed terms for the language, which are placed in
			 the {Reconstruction} namespace and must be prefixed with * to indicate a reconstruction. Most full
			 languages are natural (not constructed) languages, but a few constructed languages (e.g. Esperanto
			 and Volapük, among others) are also allowed in the mainspace and considered regular languages.
* {reconstructed}: This language is not attested according to [[WT:CFI]], and therefore is allowed only in the
				   {Reconstruction} namespace. All terms in this language are reconstructed, and must be prefixed
				   with *. Languages such as Proto-Indo-European and Proto-Germanic are in this category.
* {appendix-constructed}: This language is attested but does not meet the additional requirements set out for
						  constructed languages ([[WT:CFI#Constructed languages]]). Its entries must therefore
						  be in the Appendix namespace, but they are not reconstructed and therefore should
						  not have * prefixed in links.
]==]
function Language:getTypes()
	local types = self._types
	if types == nil then
		types = {language = true}
		if self:getFullCode() == self._code then
			types.full = true
		else
			types["etymology-only"] = true
		end
		-- self._rawData.type이 nil일 경우 빈 문자열로 처리
		for t in gmatch(self._rawData.type or "", "[^,]+") do
			types[t] = true
		end
		self._types = types
	end
	return types
end


--[==[Given a list of types as strings, returns true if the language has all of them.]==]
function Language:hasType(...)
	local args, types = {...}, self:getTypes()
	for i = 1, #args do
		if not types[args[i]] then
			return false
		end
	end
	return true
end


function Language:getWikimediaLanguages()
	if not self._wikimediaLanguageObjects then
		local m_wikimedia_languages = require("Module:wikimedia languages")
		self._wikimediaLanguageObjects = {}
		local wikimedia_codes = self._rawData.wikimedia_codes or { self._code }
		
		for _, wlangcode in ipairs(wikimedia_codes) do
			table.insert(self._wikimediaLanguageObjects, m_wikimedia_languages.getByCode(wlangcode))
		end
	end
	
	return self._wikimediaLanguageObjects
end


function Language:getWikipediaArticle()
	return self._rawData.wikipedia_article or mw.ustring.gsub(self:getCategoryName(), "Creole language", "Creole")
end



function Language:makeWikipediaLink()
	return "[[w:" .. self:getWikipediaArticle() .. "|" .. self:getCanonicalName() .. "]]"
end


function Language:getScripts()
	if not self._scriptObjects then
		local m_scripts = require("Module:scripts")
		self._scriptObjects = {}
		
		for _, sc in ipairs(self._rawData.scripts or { "None" }) do
			table.insert(self._scriptObjects, m_scripts.getByCode(sc))
		end
	end
	
	return self._scriptObjects
end

--[==[Returns the table of script codes in the language's data file.]==]
function Language:getScriptCodes()
	local scripts = self._scriptCodes
	if scripts == nil then
		scripts = self._rawData[4]
		if scripts then
			local codes, n = {}, 0
			for code in gmatch(scripts, "[^,]+") do
				n = n + 1
				-- Special handling of "Hants", which represents "Hani", "Hant" and "Hans" collectively.
				if code == "Hants" then
					codes[n] = "Hani"
					codes[n + 1] = "Hant"
					codes[n + 2] = "Hans"
					n = n + 2
				else
					codes[n] = code
				end
			end
			scripts = codes
		else
			scripts = {"None"}
		end
		self._scriptCodes = scripts
	end
	return scripts
end

--[==[Given some text, this function iterates through the scripts of a given language and tries to find the script that best matches the text. It returns a {{code|lua|Script}} object representing the script. If no match is found at all, it returns the {{code|lua|None}} script object.]==]
function Language:findBestScript(text, forceDetect)
	local useRequire = self._useRequire
	
	if not text or text == "" or text == "-" then
		return require("Module:scripts").getByCode("None", nil, nil, useRequire)
	end
	
	-- Differs from table returned by getScriptCodes, as Hants is not normalized into its constituents.
	local codes = self._bestScriptCodes
	if codes == nil then
		codes = self._rawData[4]
		codes = codes and split(codes, ",", true, true) or {"None"}
		self._bestScriptCodes = codes
	end
	
	local first_sc = codes[1]
	
	if first_sc == "All" then
		return require("Module:scripts").findBestScriptWithoutLang(text)
	end
	
	local get_script = require("Module:scripts").getByCode
	local codes_len = #codes
	
	if not (forceDetect or first_sc == "Hants" or codes_len > 1) then
		first_sc = get_script(first_sc, nil, nil, useRequire)
		local charset = first_sc.characters
		return charset and umatch(text, "[" .. charset .. "]") and first_sc or
			get_script("None", nil, nil, useRequire)
	end
	
	-- Remove all formatting characters.
	text = require("Module:utilities").get_plaintext(text)
	
	-- Remove all spaces and any ASCII punctuation. Some non-ASCII punctuation is script-specific, so can't be removed.
	text = ugsub(text, "[%s!\"#%%&'()*,%-./:;?@[\\%]_{}]+", "")
	if #text == 0 then
		return get_script("None", nil, nil, useRequire)
	end
	
	-- Try to match every script against the text,
	-- and return the one with the most matching characters.
	local bestcount, bestscript, length = 0
	for i = 1, codes_len do
		local sc = codes[i]
		-- Special case for "Hants", which is a special code that represents whichever of "Hant" or "Hans" best matches, or "Hani" if they match equally. This avoids having to list all three. In addition, "Hants" will be treated as the best match if there is at least one matching character, under the assumption that a Han script is desirable in terms that contain a mix of Han and other scripts (not counting those which use Jpan or Kore).
		if sc == "Hants" then
			local Hani = get_script("Hani", nil, nil, useRequire)
			if not Hant_chars then
				Hant_chars = self:loadData("Module:zh/data/ts")
				Hans_chars = self:loadData("Module:zh/data/st")
			end
			local t, s, found = 0, 0
			-- This is faster than using mw.ustring.gmatch directly.
			for ch in gmatch(ugsub(text, "[" .. Hani.characters .. "]", "\255%0"), "\255(.[\128-\191]*)") do
				found = true
				if Hant_chars[ch] then
					t = t + 1
					if Hans_chars[ch] then
						s = s + 1
					end
				elseif Hans_chars[ch] then
					s = s + 1
				else
					t, s = t + 1, s + 1
				end
			end
			
			if found then
				if t == s then
					return Hani
				end
				return get_script(t > s and "Hant" or "Hans", nil, nil, useRequire)
			end
		else
			sc = get_script(sc, nil, nil, useRequire)
			
			if not length then
				length = ulen(text)
			end
			
			-- Count characters by removing everything in the script's charset and comparing to the original length.
			local charset = sc.characters
			local count = charset and length - ulen(ugsub(text, "[" .. charset .. "]+", "")) or 0
			
			if count >= length then
				return sc
			elseif count > bestcount then
				bestcount = count
				bestscript = sc
			end
		end
	end
	
	-- Return best matching script, or otherwise None.
	return bestscript or get_script("None", nil, nil, useRequire)
end

--[==[Returns a <code>Family</code> object for the language family that the language belongs to. See [[Module:families]].]==]
function Language:getFamily()
	local family = self._familyObject
	if family == nil then
		family = self:getFamilyCode()
		-- If the value is nil, it's cached as false.
		family = family and require("Module:families").getByCode(family, self._useRequire) or false
		self._familyObject = family
	end
	return family or nil
end

--[==[Returns the family code in the language's data file.]==]
function Language:getFamilyCode()
	local family = self._familyCode
	if family == nil then
		-- If the value is nil, it's cached as false.
		family = self._rawData[3] or false
		self._familyCode = family
	end
	return family or nil
end

function Language:getFamilyName()
	local family = self._familyName
	if family == nil then
		family = self:getFamily()
		-- If the value is nil, it's cached as false.
		family = family and family:getCanonicalName() or false
		self._familyName = family
	end
	return family or nil
end

--[==[
If the language is etymology-only, this iterates through parents until a full language or family is found, and the
corresponding object is returned. If the language is a full language, then it simply returns itself.
]==]
function Language:getFull()
	local full = self._fullObject
	if full == nil then
		full = self:getFullCode()
		full = full == self._code and self or
			export.getByCode(full, nil, nil, nil, self._useRequire)
		self._fullObject = full
	end
	return full
end

--[==[
If the language is an etymology-only language, this iterates through parents until a full language or family is
found, and the corresponding code is returned. If the language is a full language, then it simply returns the
language code.
]==]
function Language:getFullCode()
	return self._fullCode or self._code
end

--[==[
If the language is an etymology-only language, this iterates through parents until a full language or family is
found, and the corresponding canonical name is returned. If the language is a full language, then it simply returns
the canonical name of the language.
]==]
function Language:getFullName()
	local full = self._fullName
	if full == nil then
		full = self:getFull():getCanonicalName()
		self._fullName = full
	end
	return full
end

function Language:getAncestors()
	if not self._ancestorObjects then
		self._ancestorObjects = {}
		
		if self._rawData.ancestors then
			for _, ancestor in ipairs(self._rawData.ancestors) do
				table.insert(self._ancestorObjects, export.getByCode(ancestor) or require("Module:etymology languages").getByCode(ancestor))
			end
		else
			local fam = self:getFamily()
			local protoLang = fam and fam:getProtoLanguage() or nil
			
			-- For the case where the current language is the proto-language
			-- of its family, we need to step up a level higher right from the start.
			if protoLang and protoLang:getCode() == self:getCode() then
				fam = fam:getFamily()
				protoLang = fam and fam:getProtoLanguage() or nil
			end
			
			while not protoLang and not (not fam or fam:getCode() == "qfa-not") do
				fam = fam:getFamily()
				protoLang = fam and fam:getProtoLanguage() or nil
			end
			
			table.insert(self._ancestorObjects, protoLang)
		end
	end
	
	return self._ancestorObjects
end

local function iterateOverAncestorTree(node, func)
	for _, ancestor in ipairs(node:getAncestors()) do
		if ancestor then
			local ret = func(ancestor) or iterateOverAncestorTree(ancestor, func)
			if ret then
				return ret
			end
		end
	end
end

function Language:getAncestorChain()
	if not self._ancestorChain then
		self._ancestorChain = {}
		local step = #self:getAncestors() == 1 and self:getAncestors()[1] or nil
		
		while step do
			table.insert(self._ancestorChain, 1, step)
			step = #step:getAncestors() == 1 and step:getAncestors()[1] or nil
		end
	end
	
	return self._ancestorChain
end


function Language:hasAncestor(otherlang)
	local function compare(ancestor)
		return ancestor:getCode() == otherlang:getCode()
	end
	
	return iterateOverAncestorTree(self, compare) or false
end


function Language:getCategoryName()
	local name = self._rawData.canonicalName
	
	-- If the name already has "language" in it, don't add it.
	if name:find("[Ll]anguage$") then
		return name
	else
		return name .. " language"
	end
end


function Language:getStandardCharacters()
	return self._rawData.standardChars
end


function Language:makeEntryName(text)
	text = mw.ustring.gsub(text, "^[¿¡]", "")
	text = mw.ustring.gsub(text, "(.)[؟?!;՛՜ ՞ ՟?!︖︕।॥။၊་།]$", "%1")
	
	if self:getCode() == "ar" then
		local U = mw.ustring.char
		local taTwiil = U(0x640)
		local waSla = U(0x671)
		-- diacritics ordinarily removed by entry_name replacements
		local Arabic_diacritics = U(0x64B, 0x64C, 0x64D, 0x64E, 0x64F, 0x650, 0x651, 0x652, 0x670)
		
		if text == waSla or mw.ustring.find(text, "^" .. taTwiil .. "?[" .. Arabic_diacritics .. "]" .. "$") then
			return text
		end
	end
	
	if type(self._rawData.entry_name) == "table" then
		for i, from in ipairs(self._rawData.entry_name.from) do
			local to = self._rawData.entry_name.to[i] or ""
			text = mw.ustring.gsub(text, from, to)
		end
	end
	
	--[=[ For instance, ᾰ (alpha-breve) + combining smooth breathing is converted
		to alpha + combining smooth breathing by the entry_name replacements.
		It must be re-combined to alpha-smooth breathing (ἀ) so that
		allowSelfLink in [[Module:links]] will work properly. ]=]
	if self:getCode() == "grc" then
		text = mw.ustring.toNFC(text)
	end
	
	return text
end


-- Add to data tables?
local has_dotted_undotted_i = {
	["az"] = true,
	["crh"] = true,
	["gag"] = true,
	["kaa"] = true,
	["tt"] = true,
	["tr"] = true,
	["zza"] = true,
}

--[==[Generates alternative forms using a specified method, and returns them as a table. If no method is specified, returns a table containing only the input term.]==]
function Language:generateForms(text, sc)
	if self._rawData.generate_forms then
		sc = checkScript(text, self, sc)
		return require("Module:" .. self._rawData.generate_forms).generateForms(text, self._code, sc:getCode())
	else
		return {text}
	end
end

function Language:makeSortKey(name, sc)
	if has_dotted_undotted_i[self:getCode()] then
		name = mw.ustring.gsub(name, "I", "ı")
	end
	
	name = mw.ustring.lower(name)
	
	-- Remove initial hyphens and *
	local hyphens_regex = "^[-־ـ*]+(.)"
	name = mw.ustring.gsub(name, hyphens_regex, "%1")
	
	-- If there are language-specific rules to generate the key, use those
	if type(self._rawData.sort_key) == "table" then
		for i, from in ipairs(self._rawData.sort_key.from) do
			local to = self._rawData.sort_key.to[i] or ""
			name = mw.ustring.gsub(name, from, to)
		end
	elseif type(self._rawData.sort_key) == "string" then
		name = require("Module:" .. self._rawData.sort_key).makeSortKey(name, self:getCode(), sc and sc:getCode())
	end
	
	-- Remove parentheses, as long as they are either preceded or followed by something
	name = mw.ustring.gsub(name, "(.)[()]+", "%1")
	name = mw.ustring.gsub(name, "[()]+(.)", "%1")
	
	if has_dotted_undotted_i[self:getCode()] then
		name = mw.ustring.gsub(name, "i", "İ")
	end
	
	return mw.ustring.upper(name)
end

--[==[Create the form used as as a basis for display text and transliteration.]==]
local function processDisplayText(text, self, sc, keepCarets, keepPrefixes)
	local subbedChars = {}
	text, subbedChars = doTempSubstitutions(text, subbedChars, keepCarets)

	text = decode_uri(text, "PATH")
	text = checkNoEntities(self, text)

	sc = checkScript(text, self, sc)
	local fail, cats
	text = normalize(text, sc)
	text, fail, cats, subbedChars = iterateSectionSubstitutions(text, subbedChars, keepCarets, self, sc, self._rawData.display_text, "makeDisplayText")

	text = removeCarets(text, sc)

	-- Remove any interwiki link prefixes (unless they have been escaped or this has been disabled).
	if text:match(":") and not keepPrefixes then
		local rep
		repeat
			text, rep = text:gsub("\\\\(\\*:)", "\3%1")
		until rep == 0
		text = text
			:gsub("\\:", "\4")
		while true do
			local prefix = text:gsub("^(.-):.+", function(m1)
				return m1:gsub("\244[\128-\191]*", "")
			end)
			if not prefix or prefix == text then
				break
			end
			local lower_prefix = prefix:ulower()
			if not (self:loadData("Module:data/interwikis")[lower_prefix] or prefix == "") then
				break
			end
			text = text:gsub("^(.-):(.*)", function(m1, m2)
				local ret = {}
				for subbedChar in gmatch(m1, "\244[\128-\191]*") do
					insert(ret, subbedChar)
				end
				return concat(ret) .. m2
			end)
		end
		text = text
			:gsub("\3", "\\")
			:gsub("\4", ":")
	end

	return text, fail, cats, subbedChars
end

--[==[Make the display text (i.e. what is displayed on the page).]==]
function Language:makeDisplayText(text, sc, keepPrefixes)
	if (not text) or text == "" then
		return text, nil, {}
	end
	
	local fail, cats, subbedChars
	text, fail, cats, subbedChars = processDisplayText(text, self, sc, nil, keepPrefixes)

	text = escape_risky_characters(text)
	return undoTempSubstitutions(text, subbedChars), fail, cats
end

function Language:overrideManualTranslit()
	if self._rawData.override_translit then
		return true
	else
		return false
	end
end


function Language:transliterate(text, sc, module_override)
	if not ((module_override or self._rawData.translit_module) and text) then
		return nil
	end
	
	if module_override then
		require("Module:debug").track("module_override")
	end
	
	return require("Module:" .. (module_override or self._rawData.translit_module)).tr(text, self:getCode(), sc and sc:getCode() or nil)
end

function Language:hasTranslit()
	return self._rawData.translit_module and true or false
end


function Language:link_tr()
	return self._rawData.link_tr and true or false
end


function Language:toJSON()
	local entryNamePatterns = nil
	
	if self._rawData.entry_name then
		entryNamePatterns = {}
		
		for i, from in ipairs(self._rawData.entry_name.from) do
			local to = self._rawData.entry_name.to[i] or ""
			table.insert(entryNamePatterns, { from = from, to = to })
		end
	end
	
	local ret = {
		ancestors = self._rawData.ancestors,
		canonicalName = self:getCanonicalName(),
		categoryName = self:getCategoryName(),
		code = self._code,
		entryNamePatterns = entryNamePatterns,
		family = self._rawData.family,
		otherNames = self:getOtherNames(),
		scripts = self._rawData.scripts,
		type = self:getType(),
		wikimediaLanguages = self._rawData.wikimedia_codes,
	}
	
	return require("Module:JSON").toJSON(ret)
end


-- Do NOT use this method!
-- All uses should be pre-approved on the talk page!
function Language:getRawData()
	return self._rawData
end

Language.__index = Language


function export.getDataModuleName(code)
	if code:find("^[a-z][a-z]$") then
		return "languages/data2"
	elseif code:find("^[a-z][a-z][a-z]$") then
		local prefix = code:sub(1, 1)
		return "languages/data3/" .. prefix
	elseif code:find("^[a-z-]+$") then
		return "languages/datax"
	else
		return nil
	end
end


local function getRawLanguageData(code)
	local modulename = export.getDataModuleName(code)
	return modulename and mw.loadData("Module:" .. modulename)[code] or nil
end


function export.makeObject(code, data)
	if data and data.deprecated then
		require("Module:debug").track {
			"languages/deprecated",
			"languages/deprecated/" .. code
		}
	end
	
	return data and setmetatable({ _rawData = data, _code = code }, Language) or nil
end

--[==[Finds the language whose code matches the one provided. If it exists, it returns a <code class="nf">Language</code> object representing the language. Otherwise, it returns {{code|lua|nil}}, unless <code class="n">paramForError</code> is given, in which case an error is generated. If <code class="n">paramForError</code> is {{code|lua|true}}, a generic error message mentioning the bad code is generated; otherwise <code class="n">paramForError</code> should be a string or number specifying the parameter that the code came from, and this parameter will be mentioned in the error message along with the bad code. If <code class="n">allowEtymLang</code> is specified, etymology-only language codes are allowed and looked up along with normal language codes. If <code class="n">allowFamily</code> is specified, language family codes are allowed and looked up along with normal language codes.]==]
function export.getByCode(code, paramForError, allowEtymLang, allowFamily, useRequire)
	if type(code) ~= "string" then
		local typ
		if not code then
			typ = "nil"
		elseif check_object("language", true, code) then
			typ = "a language object"
		elseif check_object("family", true, code) then
			typ = "a family object"
		else
			typ = "a " .. type(code)
		end
		error("The function getByCode expects a string as its first argument, but received " .. typ .. ".")
	end

	local function conditionalRequire(modulename)
		if useRequire then
			return require(modulename)
		else
			return load_data(modulename)
		end
	end

	-- FIXME: Temporary. Lists bad codes to track, so we can consider eliminating them.
	-- We list them directly here rather than in a separate module (cf. [[Module:etymology languages/track-bad-etym-code]])
	-- in the hope that this reduces memory usage as we have to do this for every invocation of getByCode() for every
	-- language code.
	local codes_to_track = {
		-- Codes duplicated been full and etymology-only languages
		["bsg"] = true,
		["rdb"] = true,
		["tgf"] = true,
		-- Aliases actively being deprecated
		["prv"] = true, -- oc-pro
		-- Codes that will be converted to families
		["nan"] = true,
		-- Codes being renamed
		["cmn-wadegile"] = true,
		["wuu-ngb"] = true,
		["wuu-hzh"] = true,
		["wuu-szh"] = true,
	}

	local function track_bad_code(code)
		if codes_to_track[code] then
			track(code)
		end
		return true
	end

	local modulename = export.getDataModuleName(code)

	local function get_data(code)
		return modulename and
			track_bad_code(code) and conditionalRequire("Module:" .. modulename)[code] or
			(allowEtymLang and require("Module:etymology languages/track-bad-etym-code")(code) and conditionalRequire("Module:etymology languages/data")[code]) or
			(allowFamily and conditionalRequire("Module:families/data")[code]) or
			(allowEtymLang and allowFamily and require("Module:families/track-bad-etym-code")(code) and conditionalRequire("Module:families/data/etymology")[code])
	end

	local data = get_data(code) or get_data(normalize_code(code))

	local retval = code and data and export.makeObject(code, data, useRequire)

	if not retval and paramForError then
		require("Module:languages/errorGetBy").code(code, paramForError, allowEtymLang, allowFamily)
	end

	return retval
end

function export.getByName(name)
	local byName = mw.loadData("Module:languages/by name")
	local code = byName.all and byName.all[name] or byName[name]
	
	if not code then
		return nil
	end
	
	return export.makeObject(code, getRawLanguageData(code))
end


function export.getByCanonicalName(name, errorIfInvalid, allowEtymLang, allowFamily, useRequire)
	local function conditionalRequire(modulename)
		if useRequire then
			return require(modulename)
		else
			return load_data(modulename)
		end
	end

	local byName = conditionalRequire("Module:languages/canonical names")
	local code = byName and byName[name]

	if not code and allowEtymLang then
		byName = conditionalRequire("Module:etymology languages/canonical names")
		code = byName and byName[name] or
			byName[name:gsub(" [Ss]ubstrate$", "")] or
			byName[name:gsub("^a ", "")] or
			byName[name:gsub("^a ", ""):gsub(" [Ss]ubstrate$", "")] or
			-- For etymology families like "ira-pro".
			-- FIXME: This is not ideal, as it allows " languages" to be appended to any etymology-only language, too.
			byName[name:match("^(.*) languages$")]
	end

	if not code and allowFamily then
		byName = conditionalRequire("Module:families/canonical names")
		code = byName and byName[name] or
			byName[name:match("^(.*) languages$")]
	end

	local retval = code and export.getByCode(code, errorIfInvalid, allowEtymLang, allowFamily, useRequire)

	if not retval and errorIfInvalid then
		require("Module:languages/errorGetBy").canonicalName(name, allowEtymLang, allowFamily)
	end

	return retval
end

--[==[Used by [[Module:languages/data/2]] (et al.) to add default types to the entities returned.]==]
function export.addDefaultTypes(data, regular, ...)
	local n = arg.n
	local types = n > 0 and concat(arg, ",") or ""
	for _, entity in next, data do
		-- "regular" encompasses everything that doesn't have another type already assigned.
		if regular then
			entity.type = entity.type or "regular"
		end
		if n > 0 then
			entity.type =  types .. (entity.type and ("," .. entity.type) or "")
		end
	end
	return data
end


function export.iterateAll()
	mw.incrementExpensiveFunctionCount()
	local m_data = mw.loadData("Module:languages/alldata")
	local func, t, var = pairs(m_data)
	
	return function()
		local code, data = func(t, var)
		return export.makeObject(code, data)
	end
end

--[==[Used by [[Module:languages/data/2]] (et al.) and [[Module:etymology languages/data]] to finalize language-related data into the format that is actually returned.]==]
function export.finalizeLanguageData(data)
	-- 4 is scripts.
	local fields = {4, "ancestors", "type", "wikimedia_codes"}
	local fields_len = #fields
	for _, entity in next, data do
		for i = 1, fields_len do
			local key = fields[i]
			local field = entity[key]
			if field then
				entity[key] = gsub(field, "%s+", "")
			end
		end
	end
	return data
end

--[==[For backwards compatibility only; modules should require the error themselves.]==]
function export.err(lang_code, param, code_desc, template_tag, not_real_lang)
	return require("Module:languages/error")(lang_code, param, code_desc, template_tag, not_real_lang)
end

return export